Skip to content

Commit fb05cc7

Browse files
authored
BUG: read_csv not respecting object dtype when option is set (#56047)
* BUG: read_csv not respecting object dtype when option is set
* Update readers.py
* Cover str too
* Adjust
* Fixup
* Fixup
* Update readers.py
1 parent ce4169a commit fb05cc7

File tree

4 files changed: +80 -14 lines changed

doc/source/whatsnew/v2.1.4.rst

+1
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ Bug fixes
2424
- Bug in :class:`Series` when trying to cast date-like string inputs to :class:`ArrowDtype` of ``pyarrow.timestamp`` (:issue:`56266`)
2525
- Bug in :class:`Timestamp` construction with ``ts_input="now"`` or ``ts_input="today"`` giving a different unit from :meth:`Timestamp.now` or :meth:`Timestamp.today` (:issue:`55879`)
2626
- Bug in :meth:`Index.__getitem__` returning wrong result for Arrow dtypes and negative stepsize (:issue:`55832`)
27+
- Fixed bug in :func:`read_csv` not respecting object dtype when ``infer_string`` option is set (:issue:`56047`)
2728
- Fixed bug in :func:`to_numeric` converting to extension dtype for ``string[pyarrow_numpy]`` dtype (:issue:`56179`)
2829
- Fixed bug in :meth:`.DataFrameGroupBy.min` and :meth:`.DataFrameGroupBy.max` not preserving extension dtype for empty object (:issue:`55619`)
2930
- Fixed bug in :meth:`DataFrame.__setitem__` casting :class:`Index` with object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`)

pandas/io/parsers/arrow_parser_wrapper.py

+2-12
Original file line numberDiff line numberDiff line change
@@ -296,18 +296,8 @@ def read(self) -> DataFrame:
296296
dtype_mapping[pa.null()] = pd.Int64Dtype()
297297
frame = table.to_pandas(types_mapper=dtype_mapping.get)
298298
elif using_pyarrow_string_dtype():
299-
300-
def types_mapper(dtype):
301-
dtype_dict = self.kwds["dtype"]
302-
if dtype_dict is not None and dtype_dict.get(dtype, None) is not None:
303-
return dtype_dict.get(dtype)
304-
return arrow_string_types_mapper()(dtype)
305-
306-
frame = table.to_pandas(types_mapper=types_mapper)
299+
frame = table.to_pandas(types_mapper=arrow_string_types_mapper())
307300

308301
else:
309-
if isinstance(self.kwds.get("dtype"), dict):
310-
frame = table.to_pandas(types_mapper=self.kwds["dtype"].get)
311-
else:
312-
frame = table.to_pandas()
302+
frame = table.to_pandas()
313303
return self._finalize_pandas_output(frame)

pandas/io/parsers/readers.py

+42-2
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,10 @@
55
"""
66
from __future__ import annotations
77

8-
from collections import abc
8+
from collections import (
9+
abc,
10+
defaultdict,
11+
)
912
import csv
1013
import sys
1114
from textwrap import fill
@@ -23,6 +26,8 @@
2326

2427
import numpy as np
2528

29+
from pandas._config import using_copy_on_write
30+
2631
from pandas._libs import lib
2732
from pandas._libs.parsers import STR_NA_VALUES
2833
from pandas.errors import (
@@ -38,8 +43,10 @@
3843
is_float,
3944
is_integer,
4045
is_list_like,
46+
pandas_dtype,
4147
)
4248

49+
from pandas import Series
4350
from pandas.core.frame import DataFrame
4451
from pandas.core.indexes.api import RangeIndex
4552
from pandas.core.shared_docs import _shared_docs
@@ -1846,7 +1853,40 @@ def read(self, nrows: int | None = None) -> DataFrame:
18461853
else:
18471854
new_rows = len(index)
18481855

1849-
df = DataFrame(col_dict, columns=columns, index=index)
1856+
if hasattr(self, "orig_options"):
1857+
dtype_arg = self.orig_options.get("dtype", None)
1858+
else:
1859+
dtype_arg = None
1860+
1861+
if isinstance(dtype_arg, dict):
1862+
dtype = defaultdict(lambda: None) # type: ignore[var-annotated]
1863+
dtype.update(dtype_arg)
1864+
elif dtype_arg is not None and pandas_dtype(dtype_arg) in (
1865+
np.str_,
1866+
np.object_,
1867+
):
1868+
dtype = defaultdict(lambda: dtype_arg)
1869+
else:
1870+
dtype = None
1871+
1872+
if dtype is not None:
1873+
new_col_dict = {}
1874+
for k, v in col_dict.items():
1875+
d = (
1876+
dtype[k]
1877+
if pandas_dtype(dtype[k]) in (np.str_, np.object_)
1878+
else None
1879+
)
1880+
new_col_dict[k] = Series(v, index=index, dtype=d, copy=False)
1881+
else:
1882+
new_col_dict = col_dict
1883+
1884+
df = DataFrame(
1885+
new_col_dict,
1886+
columns=columns,
1887+
index=index,
1888+
copy=not using_copy_on_write(),
1889+
)
18501890

18511891
self._currow += new_rows
18521892
return df

pandas/tests/io/parser/dtypes/test_dtypes_basic.py

+35
Original file line numberDiff line numberDiff line change
@@ -574,6 +574,41 @@ def test_string_inference(all_parsers):
574574
tm.assert_frame_equal(result, expected)
575575

576576

577+
@pytest.mark.parametrize("dtype", ["O", object, "object", np.object_, str, np.str_])
578+
def test_string_inference_object_dtype(all_parsers, dtype):
579+
# GH#56047
580+
pytest.importorskip("pyarrow")
581+
582+
data = """a,b
583+
x,a
584+
y,a
585+
z,a"""
586+
parser = all_parsers
587+
with pd.option_context("future.infer_string", True):
588+
result = parser.read_csv(StringIO(data), dtype=dtype)
589+
590+
expected = DataFrame(
591+
{
592+
"a": pd.Series(["x", "y", "z"], dtype=object),
593+
"b": pd.Series(["a", "a", "a"], dtype=object),
594+
},
595+
columns=pd.Index(["a", "b"], dtype="string[pyarrow_numpy]"),
596+
)
597+
tm.assert_frame_equal(result, expected)
598+
599+
with pd.option_context("future.infer_string", True):
600+
result = parser.read_csv(StringIO(data), dtype={"a": dtype})
601+
602+
expected = DataFrame(
603+
{
604+
"a": pd.Series(["x", "y", "z"], dtype=object),
605+
"b": pd.Series(["a", "a", "a"], dtype="string[pyarrow_numpy]"),
606+
},
607+
columns=pd.Index(["a", "b"], dtype="string[pyarrow_numpy]"),
608+
)
609+
tm.assert_frame_equal(result, expected)
610+
611+
577612
def test_accurate_parsing_of_large_integers(all_parsers):
578613
# GH#52505
579614
data = """SYMBOL,MOMENT,ID,ID_DEAL

0 commit comments

Comments
 (0)