ENH: DtypeWarning message enhancement

Abel Tavares · Abel Tavares · commit 89369f568f24 · 2024-04-13T19:40:16.000+01:00
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -37,6 +37,7 @@ Other enhancements
 - Users can globally disable any ``PerformanceWarning`` by setting the option ``mode.performance_warnings`` to ``False`` (:issue:`56920`)
 - :meth:`Styler.format_index_names` can now be used to format the index and column names (:issue:`48936` and :issue:`47489`)
 - :meth:`DataFrame.cummin`, :meth:`DataFrame.cummax`, :meth:`DataFrame.cumprod` and :meth:`DataFrame.cumsum` methods now have a ``numeric_only`` parameter (:issue:`53072`)
+- ``DtypeWarning`` now includes the column names in its message when mixed data types are detected (:issue:`58174`)
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_300.notable_bug_fixes:
diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py
@@ -235,7 +235,7 @@ def read(
             if self.low_memory:
                 chunks = self._reader.read_low_memory(nrows)
                 # destructive to chunks
-                data = _concatenate_chunks(chunks)
+                data = _concatenate_chunks(chunks, self.names)  # type: ignore[has-type]
 
             else:
                 data = self._reader.read(nrows)
@@ -358,7 +358,9 @@ def _maybe_parse_dates(self, values, index: int, try_parse_dates: bool = True):
         return values
 
 
-def _concatenate_chunks(chunks: list[dict[int, ArrayLike]]) -> dict:
+def _concatenate_chunks(
+    chunks: list[dict[int, ArrayLike]], column_names: list[str]
+) -> dict:
     """
     Concatenate chunks of data read with low_memory=True.
 
@@ -381,10 +383,12 @@ def _concatenate_chunks(chunks: list[dict[int, ArrayLike]]) -> dict:
         else:
             result[name] = concat_compat(arrs)
             if len(non_cat_dtypes) > 1 and result[name].dtype == np.dtype(object):
-                warning_columns.append(str(name))
+                warning_columns.append(column_names[name])
 
     if warning_columns:
-        warning_names = ",".join(warning_columns)
+        warning_names = ", ".join(
+            [f"{index}: {name}" for index, name in enumerate(warning_columns, start=0)]
+        )
         warning_message = " ".join(
             [
                 f"Columns ({warning_names}) have mixed types. "
diff --git a/pandas/tests/io/parser/common/test_chunksize.py b/pandas/tests/io/parser/common/test_chunksize.py
@@ -253,7 +253,7 @@ def test_warn_if_chunks_have_mismatched_type(all_parsers):
     else:
         df = parser.read_csv_check_warnings(
             warning_type,
-            r"Columns \(0\) have mixed types. "
+            r"Columns \(0: a\) have mixed types. "
             "Specify dtype option on import or set low_memory=False.",
             buf,
         )
diff --git a/pandas/tests/io/parser/test_concatenate_chunks.py b/pandas/tests/io/parser/test_concatenate_chunks.py
@@ -8,6 +8,8 @@
 
 from pandas.io.parsers.c_parser_wrapper import _concatenate_chunks
 
+COLUMN_NAMES = ["column_0", "column_1"]
+
 
 def test_concatenate_chunks_pyarrow():
     # GH#51876
@@ -16,7 +18,7 @@ def test_concatenate_chunks_pyarrow():
         {0: ArrowExtensionArray(pa.array([1.5, 2.5]))},
         {0: ArrowExtensionArray(pa.array([1, 2]))},
     ]
-    result = _concatenate_chunks(chunks)
+    result = _concatenate_chunks(chunks, COLUMN_NAMES)
     expected = ArrowExtensionArray(pa.array([1.5, 2.5, 1.0, 2.0]))
     tm.assert_extension_array_equal(result[0], expected)
 
@@ -28,8 +30,10 @@ def test_concatenate_chunks_pyarrow_strings():
         {0: ArrowExtensionArray(pa.array([1.5, 2.5]))},
         {0: ArrowExtensionArray(pa.array(["a", "b"]))},
     ]
-    with tm.assert_produces_warning(DtypeWarning, match="have mixed types"):
-        result = _concatenate_chunks(chunks)
+    with tm.assert_produces_warning(
+        DtypeWarning, match="Columns \\(0: column_0\\) have mixed types"
+    ):
+        result = _concatenate_chunks(chunks, COLUMN_NAMES)
     expected = np.concatenate(
         [np.array([1.5, 2.5], dtype=object), np.array(["a", "b"])]
     )

Original file line number	Diff line number	Diff line change
`@@ -253,7 +253,7 @@ def test_warn_if_chunks_have_mismatched_type(all_parsers):`
`253`	`253`	`else:`
`254`	`254`	`df = parser.read_csv_check_warnings(`
`255`	`255`	`warning_type,`
`256`		`- r"Columns \(0\) have mixed types. "`
	`256`	`+ r"Columns \(0: a\) have mixed types. "`
`257`	`257`	`"Specify dtype option on import or set low_memory=False.",`
`258`	`258`	`buf,`
`259`	`259`	`)`