Skip to content

Commit 89369f5

Browse files
author
Abel Tavares
committed
ENH: DtypeWarning message enhancement
1 parent b9bfc01 commit 89369f5

File tree

4 files changed

+17
-8
lines changed

4 files changed

+17
-8
lines changed

doc/source/whatsnew/v3.0.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ Other enhancements
3737
- Users can globally disable any ``PerformanceWarning`` by setting the option ``mode.performance_warnings`` to ``False`` (:issue:`56920`)
3838
- :meth:`Styler.format_index_names` can now be used to format the index and column names (:issue:`48936` and :issue:`47489`)
3939
- :meth:`DataFrame.cummin`, :meth:`DataFrame.cummax`, :meth:`DataFrame.cumprod` and :meth:`DataFrame.cumsum` methods now have a ``numeric_only`` parameter (:issue:`53072`)
40+
- The ``DtypeWarning`` message now includes the names of the columns in which mixed data types are detected (:issue:`58174`)
4041

4142
.. ---------------------------------------------------------------------------
4243
.. _whatsnew_300.notable_bug_fixes:

pandas/io/parsers/c_parser_wrapper.py

+8-4
Original file line numberDiff line numberDiff line change
@@ -235,7 +235,7 @@ def read(
235235
if self.low_memory:
236236
chunks = self._reader.read_low_memory(nrows)
237237
# destructive to chunks
238-
data = _concatenate_chunks(chunks)
238+
data = _concatenate_chunks(chunks, self.names) # type: ignore[has-type]
239239

240240
else:
241241
data = self._reader.read(nrows)
@@ -358,7 +358,9 @@ def _maybe_parse_dates(self, values, index: int, try_parse_dates: bool = True):
358358
return values
359359

360360

361-
def _concatenate_chunks(chunks: list[dict[int, ArrayLike]]) -> dict:
361+
def _concatenate_chunks(
362+
chunks: list[dict[int, ArrayLike]], column_names: list[str]
363+
) -> dict:
362364
"""
363365
Concatenate chunks of data read with low_memory=True.
364366
@@ -381,10 +383,12 @@ def _concatenate_chunks(chunks: list[dict[int, ArrayLike]]) -> dict:
381383
else:
382384
result[name] = concat_compat(arrs)
383385
if len(non_cat_dtypes) > 1 and result[name].dtype == np.dtype(object):
384-
warning_columns.append(str(name))
386+
warning_columns.append(column_names[name])
385387

386388
if warning_columns:
387-
warning_names = ",".join(warning_columns)
389+
warning_names = ", ".join(
390+
[f"{index}: {name}" for index, name in enumerate(warning_columns, start=0)]
391+
)
388392
warning_message = " ".join(
389393
[
390394
f"Columns ({warning_names}) have mixed types. "

pandas/tests/io/parser/common/test_chunksize.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -253,7 +253,7 @@ def test_warn_if_chunks_have_mismatched_type(all_parsers):
253253
else:
254254
df = parser.read_csv_check_warnings(
255255
warning_type,
256-
r"Columns \(0\) have mixed types. "
256+
r"Columns \(0: a\) have mixed types. "
257257
"Specify dtype option on import or set low_memory=False.",
258258
buf,
259259
)

pandas/tests/io/parser/test_concatenate_chunks.py

+7-3
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@
88

99
from pandas.io.parsers.c_parser_wrapper import _concatenate_chunks
1010

11+
COLUMN_NAMES = ["column_0", "column_1"]
12+
1113

1214
def test_concatenate_chunks_pyarrow():
1315
# GH#51876
@@ -16,7 +18,7 @@ def test_concatenate_chunks_pyarrow():
1618
{0: ArrowExtensionArray(pa.array([1.5, 2.5]))},
1719
{0: ArrowExtensionArray(pa.array([1, 2]))},
1820
]
19-
result = _concatenate_chunks(chunks)
21+
result = _concatenate_chunks(chunks, COLUMN_NAMES)
2022
expected = ArrowExtensionArray(pa.array([1.5, 2.5, 1.0, 2.0]))
2123
tm.assert_extension_array_equal(result[0], expected)
2224

@@ -28,8 +30,10 @@ def test_concatenate_chunks_pyarrow_strings():
2830
{0: ArrowExtensionArray(pa.array([1.5, 2.5]))},
2931
{0: ArrowExtensionArray(pa.array(["a", "b"]))},
3032
]
31-
with tm.assert_produces_warning(DtypeWarning, match="have mixed types"):
32-
result = _concatenate_chunks(chunks)
33+
with tm.assert_produces_warning(
34+
DtypeWarning, match="Columns \\(0: column_0\\) have mixed types"
35+
):
36+
result = _concatenate_chunks(chunks, COLUMN_NAMES)
3337
expected = np.concatenate(
3438
[np.array([1.5, 2.5], dtype=object), np.array(["a", "b"])]
3539
)

0 commit comments

Comments
 (0)