Backport PR #38587: ENH: Raise ParserWarning when length of names does not match length of data (#42047)

meeseeksmachine · phofl · web-flow · commit 76a28a0d2106 · 2021-06-16T09:03:28.000+01:00
Co-authored-by: Patrick Hoefler &lt;61934744+phofl@users.noreply.github.com&gt;
diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
@@ -246,6 +246,7 @@ Other enhancements
 - Improved error message when ``usecols`` and ``names`` do not match for :func:`read_csv` and ``engine="c"`` (:issue:`29042`)
 - Improved consistency of error messages when passing an invalid ``win_type`` argument in :ref:`Window methods <api.window>` (:issue:`15969`)
 - :func:`read_sql_query` now accepts a ``dtype`` argument to cast the columnar data from the SQL database based on user input (:issue:`10285`)
+- :func:`read_csv` now raising ``ParserWarning`` if length of header or given names does not match length of data when ``usecols`` is not specified (:issue:`21768`)
 - Improved integer type mapping from pandas to SQLAlchemy when using :meth:`DataFrame.to_sql` (:issue:`35076`)
 - :func:`to_numeric` now supports downcasting of nullable ``ExtensionDtype`` objects (:issue:`33013`)
 - Added support for dict-like names in :class:`MultiIndex.set_names` and :class:`MultiIndex.rename` (:issue:`20421`)
diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py
@@ -23,6 +23,7 @@
 from pandas._libs.parsers import STR_NA_VALUES
 from pandas._libs.tslibs import parsing
 from pandas._typing import (
+    ArrayLike,
     DtypeArg,
     FilePathOrBuffer,
     final,
@@ -803,6 +804,29 @@ def _do_date_conversions(self, names, data):
 
         return names, data
 
+    def _check_data_length(self, columns: list[str], data: list[ArrayLike]) -> None:
+        """Checks if length of data is equal to length of column names.
+
+        One set of trailing commas is allowed. self.index_col not False
+        results in a ParserError previously when lengths do not match.
+
+        Parameters
+        ----------
+        columns: list of column names
+        data: list of array-likes containing the data column-wise.
+        """
+        if not self.index_col and len(columns) != len(data) and columns:
+            if len(columns) == len(data) - 1 and np.all(
+                (is_object_dtype(data[-1]) and data[-1] == "") | isna(data[-1])
+            ):
+                return
+            warnings.warn(
+                "Length of header or names does not match length of data. This leads "
+                "to a loss of data with index_col=False.",
+                ParserWarning,
+                stacklevel=6,
+            )
+
     def _evaluate_usecols(self, usecols, names):
         """
         Check whether or not the 'usecols' parameter
diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py
@@ -300,6 +300,8 @@ def read(self, nrows=None):
 
             # columns as list
             alldata = [x[1] for x in data_tups]
+            if self.usecols is None:
+                self._check_data_length(names, alldata)
 
             data = {k: v for k, (i, v) in zip(names, data_tups)}
 
diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py
@@ -293,6 +293,8 @@ def _exclude_implicit_index(self, alldata):
             offset = len(self.index_col)  # type: ignore[has-type]
 
         len_alldata = len(alldata)
+        self._check_data_length(names, alldata)
+
         return {
             name: alldata[i + offset] for i, name in enumerate(names) if i < len_alldata
         }, names
diff --git a/pandas/tests/io/parser/common/__init__.py b/pandas/tests/io/parser/common/__init__.py
diff --git a/pandas/tests/io/parser/common/test_chunksize.py b/pandas/tests/io/parser/common/test_chunksize.py
@@ -143,10 +143,7 @@ def test_read_chunksize_jagged_names(all_parsers):
     parser = all_parsers
     data = "\n".join(["0"] * 7 + [",".join(["0"] * 10)])
 
-    # error: List item 0 has incompatible type "float"; expected "int"
-    expected = DataFrame(
-        [[0] + [np.nan] * 9] * 7 + [[0] * 10]  # type: ignore[list-item]
-    )
+    expected = DataFrame([[0] + [np.nan] * 9] * 7 + [[0] * 10])
     with parser.read_csv(StringIO(data), names=range(10), chunksize=4) as reader:
         result = concat(reader)
     tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py
@@ -15,6 +15,7 @@
 from pandas.errors import (
     EmptyDataError,
     ParserError,
+    ParserWarning,
 )
 
 from pandas import (
@@ -685,7 +686,8 @@ def test_no_header_two_extra_columns(all_parsers):
     ref = DataFrame([["foo", "bar", "baz"]], columns=column_names)
     stream = StringIO("foo,bar,baz,bam,blah")
     parser = all_parsers
-    df = parser.read_csv(stream, header=None, names=column_names, index_col=False)
+    with tm.assert_produces_warning(ParserWarning):
+        df = parser.read_csv(stream, header=None, names=column_names, index_col=False)
     tm.assert_frame_equal(df, ref)
 
 
diff --git a/pandas/tests/io/parser/usecols/test_usecols_basic.py b/pandas/tests/io/parser/usecols/test_usecols_basic.py
@@ -383,7 +383,9 @@ def test_usecols_indices_out_of_bounds(all_parsers, names):
 a,b
 1,2
     """
-    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+    with tm.assert_produces_warning(
+        FutureWarning, check_stacklevel=False, raise_on_extra_warnings=False
+    ):
         result = parser.read_csv(StringIO(data), usecols=[0, 2], names=names, header=0)
     expected = DataFrame({"a": [1], "b": [None]})
     if names is None and parser.engine == "python":