Skip to content

ENH: Implement arrow string option for various I/O methods #54431

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 14 commits into from
Aug 10, 2023
5 changes: 5 additions & 0 deletions pandas/_config/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,3 +38,8 @@ def using_copy_on_write() -> bool:
def using_nullable_dtypes() -> bool:
_mode_options = _global_config["mode"]
return _mode_options["nullable_dtypes"]


def using_pyarrow_string_dtype() -> bool:
    """Return the current value of the ``future.infer_string`` option."""
    return _global_config["future"]["infer_string"]
23 changes: 23 additions & 0 deletions pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1299,6 +1299,7 @@ cdef class Seen:
bint datetimetz_ # seen_datetimetz
bint period_ # seen_period
bint interval_ # seen_interval
bint str_ # seen_str

def __cinit__(self, bint coerce_numeric=False):
"""
Expand All @@ -1325,6 +1326,7 @@ cdef class Seen:
self.datetimetz_ = False
self.period_ = False
self.interval_ = False
self.str_ = False
self.coerce_numeric = coerce_numeric

cdef bint check_uint64_conflict(self) except -1:
Expand Down Expand Up @@ -2615,6 +2617,13 @@ def maybe_convert_objects(ndarray[object] objects,
else:
seen.object_ = True
break
elif isinstance(val, str):
if convert_non_numeric:
seen.str_ = True
break
else:
seen.object_ = True
break
else:
seen.object_ = True
break
Expand Down Expand Up @@ -2669,6 +2678,20 @@ def maybe_convert_objects(ndarray[object] objects,
return pi._data
seen.object_ = True

elif seen.str_:
if is_string_array(objects):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I know everywhere else does this, but is there a way to avoid this double parsing?

(Maybe we check the other flags are all false?)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, you exit the first loop as soon as you find one string

from pandas._config import get_option
opt = get_option("future.infer_string")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we pass this in as a kwarg to maybe_convert_objects instead?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd rather only get the option if actually needed

if opt is True:
import pyarrow as pa

from pandas.core.dtypes.dtypes import ArrowDtype

obj = pa.array(objects)
dtype = ArrowDtype(obj.type)
return dtype.construct_array_type()(obj)

seen.object_ = True
elif seen.interval_:
if is_interval_array(objects):
from pandas import IntervalIndex
Expand Down
11 changes: 11 additions & 0 deletions pandas/core/config_init.py
Original file line number Diff line number Diff line change
Expand Up @@ -889,3 +889,14 @@ def register_converter_cb(key) -> None:
styler_environment,
validator=is_instance_factory([type(None), str]),
)


# Register the opt-in ``future.infer_string`` option; it is off by default and
# only accepts the literal values True and False.
with cf.config_prefix("future"):
cf.register_option(
"infer_string",
False,
"Whether to infer sequence of str objects as pyarrow string "
"dtype, which will be the default in pandas 3.0 "
"(at which point this option will be deprecated).",
validator=is_one_of_factory([True, False]),
)
8 changes: 8 additions & 0 deletions pandas/core/dtypes/cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@

import numpy as np

from pandas._config import get_option

from pandas._libs import lib
from pandas._libs.missing import (
NA,
Expand Down Expand Up @@ -796,6 +798,12 @@ def infer_dtype_from_scalar(val) -> tuple[DtypeObj, Any]:
# coming out as np.str_!

dtype = _dtype_obj
opt = get_option("future.infer_string")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

using_pyarrow_string_dtype?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Follow-up. This is introduced in the other PR (a little bit confusing, sorry).

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

added

if opt is True:
import pyarrow as pa

pa_dtype = pa.string()
dtype = ArrowDtype(pa_dtype)

elif isinstance(val, (np.datetime64, dt.datetime)):
try:
Expand Down
8 changes: 8 additions & 0 deletions pandas/io/_util.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
from __future__ import annotations

from typing import Callable

from pandas.compat._optional import import_optional_dependency

import pandas as pd
Expand All @@ -21,3 +23,9 @@ def _arrow_dtype_mapping() -> dict:
pa.float32(): pd.Float32Dtype(),
pa.float64(): pd.Float64Dtype(),
}


def arrow_string_types_mapper() -> Callable:
    """
    Build a ``types_mapper`` callable for pyarrow string columns.

    The returned callable is the ``get`` method of a one-entry dict that maps
    ``pa.string()`` to ``pd.ArrowDtype(pa.string())``; any other key yields
    ``None``.
    """
    pa = import_optional_dependency("pyarrow")

    string_type = pa.string()
    string_mapping = {string_type: pd.ArrowDtype(string_type)}
    return string_mapping.get
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thinking about this a little, is there a situation where you would want to mix pyarrow and numpy dtypes?

(I'm thinking maybe we should force users to pick the pyarrow dtype backend if you are using the pyarrow string type)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes there are a lot of situations.

NumPy numeric with Arrow strings is still the fastest combination, and NumPy numeric data is 2D. Forcing the pyarrow backend right now is not a good idea.

10 changes: 9 additions & 1 deletion pandas/io/feather_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
Any,
)

from pandas._config import using_pyarrow_string_dtype

from pandas._libs import lib
from pandas.compat._optional import import_optional_dependency
from pandas.util._decorators import doc
Expand All @@ -15,6 +17,7 @@
from pandas.core.api import DataFrame
from pandas.core.shared_docs import _shared_docs

from pandas.io._util import arrow_string_types_mapper
from pandas.io.common import get_handle

if TYPE_CHECKING:
Expand Down Expand Up @@ -119,7 +122,7 @@ def read_feather(
with get_handle(
path, "rb", storage_options=storage_options, is_text=False
) as handles:
if dtype_backend is lib.no_default:
if dtype_backend is lib.no_default and not using_pyarrow_string_dtype():
return feather.read_feather(
handles.handle, columns=columns, use_threads=bool(use_threads)
)
Expand All @@ -135,3 +138,8 @@ def read_feather(

elif dtype_backend == "pyarrow":
return pa_table.to_pandas(types_mapper=pd.ArrowDtype)

elif using_pyarrow_string_dtype():
return pa_table.to_pandas(types_mapper=arrow_string_types_mapper())
else:
raise NotImplementedError
10 changes: 9 additions & 1 deletion pandas/io/orc.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
Literal,
)

from pandas._config import using_pyarrow_string_dtype

from pandas._libs import lib
from pandas.compat import pa_version_under8p0
from pandas.compat._optional import import_optional_dependency
Expand All @@ -24,6 +26,7 @@
import pandas as pd
from pandas.core.indexes.api import default_index

from pandas.io._util import arrow_string_types_mapper
from pandas.io.common import (
get_handle,
is_fsspec_url,
Expand Down Expand Up @@ -132,7 +135,12 @@ def read_orc(
df = pa_table.to_pandas(types_mapper=mapping.get)
return df
else:
return pa_table.to_pandas()
print("Ts")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

remove

if using_pyarrow_string_dtype():
types_mapper = arrow_string_types_mapper()
else:
types_mapper = None
return pa_table.to_pandas(types_mapper=types_mapper)


def to_orc(
Expand Down
5 changes: 5 additions & 0 deletions pandas/io/parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
import warnings
from warnings import catch_warnings

from pandas._config import using_pyarrow_string_dtype

from pandas._libs import lib
from pandas.compat._optional import import_optional_dependency
from pandas.errors import AbstractMethodError
Expand All @@ -26,6 +28,7 @@
)
from pandas.core.shared_docs import _shared_docs

from pandas.io._util import arrow_string_types_mapper
from pandas.io.common import (
IOHandles,
get_handle,
Expand Down Expand Up @@ -252,6 +255,8 @@ def read(
to_pandas_kwargs["types_mapper"] = mapping.get
elif dtype_backend == "pyarrow":
to_pandas_kwargs["types_mapper"] = pd.ArrowDtype # type: ignore[assignment] # noqa: E501
elif using_pyarrow_string_dtype():
to_pandas_kwargs["types_mapper"] = arrow_string_types_mapper()

manager = get_option("mode.data_manager")
if manager == "array":
Expand Down
9 changes: 8 additions & 1 deletion pandas/io/parsers/arrow_parser_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

from typing import TYPE_CHECKING

from pandas._config import using_pyarrow_string_dtype

from pandas._libs import lib
from pandas.compat._optional import import_optional_dependency

Expand All @@ -10,7 +12,10 @@
import pandas as pd
from pandas import DataFrame

from pandas.io._util import _arrow_dtype_mapping
from pandas.io._util import (
_arrow_dtype_mapping,
arrow_string_types_mapper,
)
from pandas.io.parsers.base_parser import ParserBase

if TYPE_CHECKING:
Expand Down Expand Up @@ -215,6 +220,8 @@ def read(self) -> DataFrame:
dtype_mapping = _arrow_dtype_mapping()
dtype_mapping[pa.null()] = pd.Int64Dtype()
frame = table.to_pandas(types_mapper=dtype_mapping.get)
elif using_pyarrow_string_dtype():
frame = table.to_pandas(types_mapper=arrow_string_types_mapper())
else:
frame = table.to_pandas()
return self._finalize_pandas_output(frame)
17 changes: 16 additions & 1 deletion pandas/io/pytables.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
from pandas._config import (
config,
get_option,
using_pyarrow_string_dtype,
)

from pandas._libs import (
Expand Down Expand Up @@ -66,6 +67,7 @@
)
from pandas.core.dtypes.missing import array_equivalent

import pandas as pd
from pandas import (
DataFrame,
DatetimeIndex,
Expand Down Expand Up @@ -3219,7 +3221,12 @@ def read(
self.validate_read(columns, where)
index = self.read_index("index", start=start, stop=stop)
values = self.read_array("values", start=start, stop=stop)
return Series(values, index=index, name=self.name, copy=False)
result = Series(values, index=index, name=self.name, copy=False)
if result.dtype.kind == "O" and using_pyarrow_string_dtype():
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not too familiar with this code, but do we need to check if result is a string array first if doing this?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah that makes sense

import pyarrow as pa

result = result.astype(pd.ArrowDtype(pa.string()))
return result

# error: Signature of "write" incompatible with supertype "Fixed"
def write(self, obj, **kwargs) -> None: # type: ignore[override]
Expand Down Expand Up @@ -3287,6 +3294,10 @@ def read(

columns = items[items.get_indexer(blk_items)]
df = DataFrame(values.T, columns=columns, index=axes[1], copy=False)
if values.dtype.kind == "O" and using_pyarrow_string_dtype():
import pyarrow as pa

df = df.astype(pd.ArrowDtype(pa.string()))
dfs.append(df)

if len(dfs) > 0:
Expand Down Expand Up @@ -4669,6 +4680,10 @@ def read(
# Categorical
df = DataFrame._from_arrays([values], columns=cols_, index=index_)
assert (df.dtypes == values.dtype).all(), (df.dtypes, values.dtype)
if values.dtype.kind == "O" and using_pyarrow_string_dtype():
import pyarrow as pa

df = df.astype(pd.ArrowDtype(pa.string()))
frames.append(df)

if len(frames) == 1:
Expand Down
35 changes: 35 additions & 0 deletions pandas/tests/frame/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -2689,6 +2689,41 @@ def test_construct_with_strings_and_none(self):
expected = DataFrame({"a": ["1", "2", None]}, dtype="str")
tm.assert_frame_equal(df, expected)

def test_frame_string_inference(self):
    """Exercise DataFrame construction with ``future.infer_string`` enabled."""
    # GH#54430
    pa = pytest.importorskip("pyarrow")
    string_dtype = pd.ArrowDtype(pa.string())
    string_columns = Index(["a"], dtype=string_dtype)

    # All-str values: data and column labels are expected as arrow strings.
    with pd.option_context("future.infer_string", True):
        result = DataFrame({"a": ["a", "b"]})
    expected = DataFrame(
        {"a": ["a", "b"]}, dtype=string_dtype, columns=string_columns
    )
    tm.assert_frame_equal(result, expected)

    # str row labels are expected as arrow strings as well.
    with pd.option_context("future.infer_string", True):
        result = DataFrame({"a": ["a", "b"]}, index=["x", "y"])
    expected = DataFrame(
        {"a": ["a", "b"]},
        dtype=string_dtype,
        columns=string_columns,
        index=Index(["x", "y"], dtype=string_dtype),
    )
    tm.assert_frame_equal(result, expected)

    # Mixed str/int values stay object; the column labels are arrow strings.
    with pd.option_context("future.infer_string", True):
        result = DataFrame({"a": ["a", 1]})
    expected = DataFrame({"a": ["a", 1]}, dtype="object", columns=string_columns)
    tm.assert_frame_equal(result, expected)

    # An explicitly requested object dtype is kept for the values.
    with pd.option_context("future.infer_string", True):
        result = DataFrame({"a": ["a", "b"]}, dtype="object")
    expected = DataFrame({"a": ["a", "b"]}, dtype="object", columns=string_columns)
    tm.assert_frame_equal(result, expected)


class TestDataFrameConstructorIndexInference:
def test_frame_from_dict_of_series_overlapping_monthly_period_indexes(self):
Expand Down
15 changes: 15 additions & 0 deletions pandas/tests/indexes/base_class/test_constructors.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import numpy as np
import pytest

import pandas as pd
from pandas import (
Index,
MultiIndex,
Expand Down Expand Up @@ -42,3 +43,17 @@ def test_construct_empty_tuples(self, tuple_list):
expected = MultiIndex.from_tuples(tuple_list)

tm.assert_index_equal(result, expected)

def test_index_string_inference(self):
    """Exercise Index construction with ``future.infer_string`` enabled."""
    # GH#54430
    pa = pytest.importorskip("pyarrow")
    string_dtype = pd.ArrowDtype(pa.string())

    # All-str values are expected to come back with the arrow string dtype.
    with pd.option_context("future.infer_string", True):
        result = Index(["a", "b"])
    expected = Index(["a", "b"], dtype=string_dtype)
    tm.assert_index_equal(result, expected)

    # Mixed str/int values are expected to stay object dtype.
    with pd.option_context("future.infer_string", True):
        result = Index(["a", 1])
    expected = Index(["a", 1], dtype="object")
    tm.assert_index_equal(result, expected)
19 changes: 19 additions & 0 deletions pandas/tests/io/parser/dtypes/test_dtypes_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -538,3 +538,22 @@ def test_ea_int_avoid_overflow(all_parsers):
}
)
tm.assert_frame_equal(result, expected)


def test_string_inference(all_parsers):
# GH#54430
pa = pytest.importorskip("pyarrow")
dtype = pd.ArrowDtype(pa.string())

data = """a,b
x,1
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add a test case with null/nan/None like in your other PR?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can add a missing field; actually having these values doesn't make much sense.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added

y,2"""
parser = all_parsers
with pd.option_context("future.infer_string", True):
result = parser.read_csv(StringIO(data))

expected = DataFrame(
{"a": pd.Series(["x", "y"], dtype=dtype), "b": [1, 2]},
columns=pd.Index(["a", "b"], dtype=dtype),
)
tm.assert_frame_equal(result, expected)
14 changes: 14 additions & 0 deletions pandas/tests/io/test_feather.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,3 +219,17 @@ def test_invalid_dtype_backend(self):
df.to_feather(path)
with pytest.raises(ValueError, match=msg):
read_feather(path, dtype_backend="numpy")

def test_string_inference(self, tmp_path):
    """Round-trip a str column through feather with ``future.infer_string`` on."""
    # GH#54431
    import pyarrow as pa

    feather_path = tmp_path / "test_string_inference.p"
    raw = {"a": ["x", "y"]}

    # Write with default options; only the read happens under the option.
    pd.DataFrame(data=raw).to_feather(feather_path)
    with pd.option_context("future.infer_string", True):
        result = read_feather(feather_path)

    string_dtype = pd.ArrowDtype(pa.string())
    expected = pd.DataFrame(data={"a": ["x", "y"]}, dtype=string_dtype)
    tm.assert_frame_equal(result, expected)
Loading