ENH: DtypeWarning message enhancement (#58250)

abeltavares · Abel Tavares · web-flow · commit e13d8080d390 · 2024-04-18T16:55:24.000-07:00
Co-authored-by: Abel Tavares <abel.tavares@ctw.bmwgroup.com>
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -37,6 +37,7 @@ Other enhancements
 - Support reading value labels from Stata 108-format (Stata 6) and earlier files (:issue:`58154`)
 - Users can globally disable any ``PerformanceWarning`` by setting the option ``mode.performance_warnings`` to ``False`` (:issue:`56920`)
 - :meth:`Styler.format_index_names` can now be used to format the index and column names (:issue:`48936` and :issue:`47489`)
+- :class:`.errors.DtypeWarning` improved to include column names when mixed data types are detected (:issue:`58174`)
 - :meth:`DataFrame.cummin`, :meth:`DataFrame.cummax`, :meth:`DataFrame.cumprod` and :meth:`DataFrame.cumsum` methods now have a ``numeric_only`` parameter (:issue:`53072`)
 - :meth:`DataFrame.fillna` and :meth:`Series.fillna` can now accept ``value=None``; for non-object dtype the corresponding NA value will be used (:issue:`57723`)
 
diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py
@@ -178,7 +178,7 @@ class DtypeWarning(Warning):
     ... )  # doctest: +SKIP
     >>> df.to_csv("test.csv", index=False)  # doctest: +SKIP
     >>> df2 = pd.read_csv("test.csv")  # doctest: +SKIP
-    ... # DtypeWarning: Columns (0) have mixed types
+    ... # DtypeWarning: Columns (0: a) have mixed types
 
     Important to notice that ``df2`` will contain both `str` and `int` for the
     same input, '1'.
diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py
@@ -235,7 +235,7 @@ def read(
             if self.low_memory:
                 chunks = self._reader.read_low_memory(nrows)
                 # destructive to chunks
-                data = _concatenate_chunks(chunks)
+                data = _concatenate_chunks(chunks, self.names)  # type: ignore[has-type]
 
             else:
                 data = self._reader.read(nrows)
@@ -358,7 +358,9 @@ def _maybe_parse_dates(self, values, index: int, try_parse_dates: bool = True):
         return values
 
 
-def _concatenate_chunks(chunks: list[dict[int, ArrayLike]]) -> dict:
+def _concatenate_chunks(
+    chunks: list[dict[int, ArrayLike]], column_names: list[str]
+) -> dict:
     """
     Concatenate chunks of data read with low_memory=True.
 
@@ -381,10 +383,12 @@ def _concatenate_chunks(chunks: list[dict[int, ArrayLike]]) -> dict:
         else:
             result[name] = concat_compat(arrs)
             if len(non_cat_dtypes) > 1 and result[name].dtype == np.dtype(object):
-                warning_columns.append(str(name))
+                warning_columns.append(column_names[name])
 
     if warning_columns:
-        warning_names = ",".join(warning_columns)
+        warning_names = ", ".join(
+            [f"{index}: {name}" for index, name in enumerate(warning_columns)]
+        )
         warning_message = " ".join(
             [
                 f"Columns ({warning_names}) have mixed types. "
diff --git a/pandas/tests/io/parser/common/test_chunksize.py b/pandas/tests/io/parser/common/test_chunksize.py
@@ -253,7 +253,7 @@ def test_warn_if_chunks_have_mismatched_type(all_parsers):
     else:
         df = parser.read_csv_check_warnings(
             warning_type,
-            r"Columns \(0\) have mixed types. "
+            r"Columns \(0: a\) have mixed types. "
             "Specify dtype option on import or set low_memory=False.",
             buf,
         )
diff --git a/pandas/tests/io/parser/test_concatenate_chunks.py b/pandas/tests/io/parser/test_concatenate_chunks.py
@@ -16,7 +16,7 @@ def test_concatenate_chunks_pyarrow():
         {0: ArrowExtensionArray(pa.array([1.5, 2.5]))},
         {0: ArrowExtensionArray(pa.array([1, 2]))},
     ]
-    result = _concatenate_chunks(chunks)
+    result = _concatenate_chunks(chunks, ["column_0", "column_1"])
     expected = ArrowExtensionArray(pa.array([1.5, 2.5, 1.0, 2.0]))
     tm.assert_extension_array_equal(result[0], expected)
 
@@ -28,8 +28,10 @@ def test_concatenate_chunks_pyarrow_strings():
         {0: ArrowExtensionArray(pa.array([1.5, 2.5]))},
         {0: ArrowExtensionArray(pa.array(["a", "b"]))},
     ]
-    with tm.assert_produces_warning(DtypeWarning, match="have mixed types"):
-        result = _concatenate_chunks(chunks)
+    with tm.assert_produces_warning(
+        DtypeWarning, match="Columns \\(0: column_0\\) have mixed types"
+    ):
+        result = _concatenate_chunks(chunks, ["column_0", "column_1"])
     expected = np.concatenate(
         [np.array([1.5, 2.5], dtype=object), np.array(["a", "b"])]
     )

Original file line number	Diff line number	Diff line change
`@@ -253,7 +253,7 @@ def test_warn_if_chunks_have_mismatched_type(all_parsers):`
`253`	`253`	`else:`
`254`	`254`	`df = parser.read_csv_check_warnings(`
`255`	`255`	`warning_type,`
`256`		`- r"Columns \(0\) have mixed types. "`
	`256`	`+ r"Columns \(0: a\) have mixed types. "`
`257`	`257`	`"Specify dtype option on import or set low_memory=False.",`
`258`	`258`	`buf,`
`259`	`259`	`)`