Skip to content

Commit 76a28a0

Browse files
Backport PR #38587: ENH: Raise ParserWarning when length of names does not match length of data (#42047)
Co-authored-by: Patrick Hoefler <[email protected]>
1 parent 0e11a00 commit 76a28a0

File tree

8 files changed

+36
-6
lines changed

8 files changed

+36
-6
lines changed

doc/source/whatsnew/v1.3.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -246,6 +246,7 @@ Other enhancements
246246
- Improved error message when ``usecols`` and ``names`` do not match for :func:`read_csv` and ``engine="c"`` (:issue:`29042`)
247247
- Improved consistency of error messages when passing an invalid ``win_type`` argument in :ref:`Window methods <api.window>` (:issue:`15969`)
248248
- :func:`read_sql_query` now accepts a ``dtype`` argument to cast the columnar data from the SQL database based on user input (:issue:`10285`)
249+
- :func:`read_csv` now raises a ``ParserWarning`` if the length of the header or the given names does not match the length of the data when ``usecols`` is not specified (:issue:`21768`)
249250
- Improved integer type mapping from pandas to SQLAlchemy when using :meth:`DataFrame.to_sql` (:issue:`35076`)
250251
- :func:`to_numeric` now supports downcasting of nullable ``ExtensionDtype`` objects (:issue:`33013`)
251252
- Added support for dict-like names in :class:`MultiIndex.set_names` and :class:`MultiIndex.rename` (:issue:`20421`)

pandas/io/parsers/base_parser.py

+24
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
from pandas._libs.parsers import STR_NA_VALUES
2424
from pandas._libs.tslibs import parsing
2525
from pandas._typing import (
26+
ArrayLike,
2627
DtypeArg,
2728
FilePathOrBuffer,
2829
final,
@@ -803,6 +804,29 @@ def _do_date_conversions(self, names, data):
803804

804805
return names, data
805806

807+
def _check_data_length(self, columns: list[str], data: list[ArrayLike]) -> None:
808+
"""Checks if length of data is equal to length of column names.
809+
810+
One set of trailing commas is allowed. self.index_col not False
811+
results in a ParserError previously when lengths do not match.
812+
813+
Parameters
814+
----------
815+
columns: list of column names
816+
data: list of array-likes containing the data column-wise.
817+
"""
818+
if not self.index_col and len(columns) != len(data) and columns:
819+
if len(columns) == len(data) - 1 and np.all(
820+
(is_object_dtype(data[-1]) and data[-1] == "") | isna(data[-1])
821+
):
822+
return
823+
warnings.warn(
824+
"Length of header or names does not match length of data. This leads "
825+
"to a loss of data with index_col=False.",
826+
ParserWarning,
827+
stacklevel=6,
828+
)
829+
806830
def _evaluate_usecols(self, usecols, names):
807831
"""
808832
Check whether or not the 'usecols' parameter

pandas/io/parsers/c_parser_wrapper.py

+2
Original file line numberDiff line numberDiff line change
@@ -300,6 +300,8 @@ def read(self, nrows=None):
300300

301301
# columns as list
302302
alldata = [x[1] for x in data_tups]
303+
if self.usecols is None:
304+
self._check_data_length(names, alldata)
303305

304306
data = {k: v for k, (i, v) in zip(names, data_tups)}
305307

pandas/io/parsers/python_parser.py

+2
Original file line numberDiff line numberDiff line change
@@ -293,6 +293,8 @@ def _exclude_implicit_index(self, alldata):
293293
offset = len(self.index_col) # type: ignore[has-type]
294294

295295
len_alldata = len(alldata)
296+
self._check_data_length(names, alldata)
297+
296298
return {
297299
name: alldata[i + offset] for i, name in enumerate(names) if i < len_alldata
298300
}, names

pandas/tests/io/parser/common/__init__.py

Whitespace-only changes.

pandas/tests/io/parser/common/test_chunksize.py

+1-4
Original file line numberDiff line numberDiff line change
@@ -143,10 +143,7 @@ def test_read_chunksize_jagged_names(all_parsers):
143143
parser = all_parsers
144144
data = "\n".join(["0"] * 7 + [",".join(["0"] * 10)])
145145

146-
# error: List item 0 has incompatible type "float"; expected "int"
147-
expected = DataFrame(
148-
[[0] + [np.nan] * 9] * 7 + [[0] * 10] # type: ignore[list-item]
149-
)
146+
expected = DataFrame([[0] + [np.nan] * 9] * 7 + [[0] * 10])
150147
with parser.read_csv(StringIO(data), names=range(10), chunksize=4) as reader:
151148
result = concat(reader)
152149
tm.assert_frame_equal(result, expected)

pandas/tests/io/parser/common/test_common_basic.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
from pandas.errors import (
1616
EmptyDataError,
1717
ParserError,
18+
ParserWarning,
1819
)
1920

2021
from pandas import (
@@ -685,7 +686,8 @@ def test_no_header_two_extra_columns(all_parsers):
685686
ref = DataFrame([["foo", "bar", "baz"]], columns=column_names)
686687
stream = StringIO("foo,bar,baz,bam,blah")
687688
parser = all_parsers
688-
df = parser.read_csv(stream, header=None, names=column_names, index_col=False)
689+
with tm.assert_produces_warning(ParserWarning):
690+
df = parser.read_csv(stream, header=None, names=column_names, index_col=False)
689691
tm.assert_frame_equal(df, ref)
690692

691693

pandas/tests/io/parser/usecols/test_usecols_basic.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -383,7 +383,9 @@ def test_usecols_indices_out_of_bounds(all_parsers, names):
383383
a,b
384384
1,2
385385
"""
386-
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
386+
with tm.assert_produces_warning(
387+
FutureWarning, check_stacklevel=False, raise_on_extra_warnings=False
388+
):
387389
result = parser.read_csv(StringIO(data), usecols=[0, 2], names=names, header=0)
388390
expected = DataFrame({"a": [1], "b": [None]})
389391
if names is None and parser.engine == "python":

0 commit comments

Comments
 (0)