
ENH: Implement arrow string option for various I/O methods #54431

Merged
merged 14 commits on Aug 10, 2023
5 changes: 5 additions & 0 deletions pandas/_config/__init__.py
@@ -38,3 +38,8 @@ def using_copy_on_write() -> bool:
def using_nullable_dtypes() -> bool:
_mode_options = _global_config["mode"]
return _mode_options["nullable_dtypes"]


def using_pyarrow_string_dtype() -> bool:
_mode_options = _global_config["future"]
return _mode_options["infer_string"]
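
For orientation, a minimal usage sketch of the new helper (assuming the future.infer_string option is registered, as elsewhere in this PR); it simply reads the global config, so it reflects option_context changes:

import pandas as pd
from pandas._config import using_pyarrow_string_dtype

print(using_pyarrow_string_dtype())      # False unless the option is enabled
with pd.option_context("future.infer_string", True):
    print(using_pyarrow_string_dtype())  # True while the option is active
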
6 changes: 3 additions & 3 deletions pandas/_libs/lib.pyx
@@ -38,6 +38,8 @@ from cython cimport (
floating,
)

from pandas._config import using_pyarrow_string_dtype

from pandas._libs.missing import check_na_tuples_nonequal

import_datetime()
@@ -2680,9 +2682,7 @@ def maybe_convert_objects(ndarray[object] objects,

elif seen.str_:
if is_string_array(objects, skipna=True):
from pandas._config import get_option
opt = get_option("future.infer_string")
if opt is True:
if using_pyarrow_string_dtype():
import pyarrow as pa

from pandas.core.dtypes.dtypes import ArrowDtype
5 changes: 2 additions & 3 deletions pandas/core/dtypes/cast.py
@@ -18,7 +18,7 @@

import numpy as np

from pandas._config import get_option
from pandas._config import using_pyarrow_string_dtype

from pandas._libs import lib
from pandas._libs.missing import (
@@ -798,8 +798,7 @@ def infer_dtype_from_scalar(val) -> tuple[DtypeObj, Any]:
# coming out as np.str_!

dtype = _dtype_obj
opt = get_option("future.infer_string")
if opt is True:
if using_pyarrow_string_dtype():
import pyarrow as pa

pa_dtype = pa.string()
8 changes: 8 additions & 0 deletions pandas/io/_util.py
@@ -1,5 +1,7 @@
from __future__ import annotations

from typing import Callable

from pandas.compat._optional import import_optional_dependency

import pandas as pd
@@ -21,3 +23,9 @@ def _arrow_dtype_mapping() -> dict:
pa.float32(): pd.Float32Dtype(),
pa.float64(): pd.Float64Dtype(),
}


def arrow_string_types_mapper() -> Callable:
pa = import_optional_dependency("pyarrow")

return {pa.string(): pd.ArrowDtype(pa.string())}.get

Member:

Thinking about this a little, is there a situation where you would want to mix pyarrow and numpy dtypes?

(I'm thinking maybe we should force users to pick the pyarrow dtype backend if they are using the pyarrow string type.)

Member Author:

Yes, there are a lot of situations.

Combining NumPy numerics with Arrow strings is still the fastest option, and NumPy numeric data is stored as 2D blocks. Forcing the pyarrow backend right now is not a good idea.
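
To make the point concrete, here is a hedged illustration (column names and data are invented) of a frame that mixes a NumPy-backed numeric column with an Arrow-backed string column:

import numpy as np
import pyarrow as pa
import pandas as pd

df = pd.DataFrame(
    {
        "values": np.arange(3),  # NumPy int64, kept in 2D block storage
        "labels": pd.array(["x", "y", "z"], dtype=pd.ArrowDtype(pa.string())),
    }
)
print(df.dtypes)  # values: int64, labels: string[pyarrow]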

10 changes: 9 additions & 1 deletion pandas/io/feather_format.py
@@ -6,6 +6,8 @@
Any,
)

from pandas._config import using_pyarrow_string_dtype

from pandas._libs import lib
from pandas.compat._optional import import_optional_dependency
from pandas.util._decorators import doc
@@ -15,6 +17,7 @@
from pandas.core.api import DataFrame
from pandas.core.shared_docs import _shared_docs

from pandas.io._util import arrow_string_types_mapper
from pandas.io.common import get_handle

if TYPE_CHECKING:
@@ -119,7 +122,7 @@ def read_feather(
with get_handle(
path, "rb", storage_options=storage_options, is_text=False
) as handles:
if dtype_backend is lib.no_default:
if dtype_backend is lib.no_default and not using_pyarrow_string_dtype():
return feather.read_feather(
handles.handle, columns=columns, use_threads=bool(use_threads)
)
@@ -135,3 +138,8 @@

elif dtype_backend == "pyarrow":
return pa_table.to_pandas(types_mapper=pd.ArrowDtype)

elif using_pyarrow_string_dtype():
return pa_table.to_pandas(types_mapper=arrow_string_types_mapper())
else:
raise NotImplementedError
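
For reference, a hedged sketch of the mechanism used above (the in-memory table is illustrative only): Table.to_pandas calls the mapper for each Arrow type and falls back to the default conversion whenever the mapper returns None, so only string columns come back Arrow-backed:

import pyarrow as pa
from pandas.io._util import arrow_string_types_mapper

table = pa.table({"a": ["x", "y"], "b": [1, 2]})
df = table.to_pandas(types_mapper=arrow_string_types_mapper())
print(df.dtypes)  # a: string[pyarrow], b: int64
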
9 changes: 8 additions & 1 deletion pandas/io/orc.py
@@ -9,6 +9,8 @@
Literal,
)

from pandas._config import using_pyarrow_string_dtype

from pandas._libs import lib
from pandas.compat import pa_version_under8p0
from pandas.compat._optional import import_optional_dependency
@@ -24,6 +26,7 @@
import pandas as pd
from pandas.core.indexes.api import default_index

from pandas.io._util import arrow_string_types_mapper
from pandas.io.common import (
get_handle,
is_fsspec_url,
@@ -132,7 +135,11 @@ def read_orc(
df = pa_table.to_pandas(types_mapper=mapping.get)
return df
else:
return pa_table.to_pandas()
if using_pyarrow_string_dtype():
types_mapper = arrow_string_types_mapper()
else:
types_mapper = None
return pa_table.to_pandas(types_mapper=types_mapper)


def to_orc(
5 changes: 5 additions & 0 deletions pandas/io/parquet.py
@@ -12,6 +12,8 @@
import warnings
from warnings import catch_warnings

from pandas._config import using_pyarrow_string_dtype

from pandas._libs import lib
from pandas.compat._optional import import_optional_dependency
from pandas.errors import AbstractMethodError
@@ -26,6 +28,7 @@
)
from pandas.core.shared_docs import _shared_docs

from pandas.io._util import arrow_string_types_mapper
from pandas.io.common import (
IOHandles,
get_handle,
@@ -252,6 +255,8 @@ def read(
to_pandas_kwargs["types_mapper"] = mapping.get
elif dtype_backend == "pyarrow":
to_pandas_kwargs["types_mapper"] = pd.ArrowDtype # type: ignore[assignment] # noqa: E501
elif using_pyarrow_string_dtype():
to_pandas_kwargs["types_mapper"] = arrow_string_types_mapper()

manager = get_option("mode.data_manager")
if manager == "array":
9 changes: 8 additions & 1 deletion pandas/io/parsers/arrow_parser_wrapper.py
@@ -2,6 +2,8 @@

from typing import TYPE_CHECKING

from pandas._config import using_pyarrow_string_dtype

from pandas._libs import lib
from pandas.compat._optional import import_optional_dependency

@@ -10,7 +12,10 @@
import pandas as pd
from pandas import DataFrame

from pandas.io._util import _arrow_dtype_mapping
from pandas.io._util import (
_arrow_dtype_mapping,
arrow_string_types_mapper,
)
from pandas.io.parsers.base_parser import ParserBase

if TYPE_CHECKING:
@@ -215,6 +220,8 @@ def read(self) -> DataFrame:
dtype_mapping = _arrow_dtype_mapping()
dtype_mapping[pa.null()] = pd.Int64Dtype()
frame = table.to_pandas(types_mapper=dtype_mapping.get)
elif using_pyarrow_string_dtype():
frame = table.to_pandas(types_mapper=arrow_string_types_mapper())
else:
frame = table.to_pandas()
return self._finalize_pandas_output(frame)
24 changes: 22 additions & 2 deletions pandas/io/pytables.py
@@ -30,12 +30,14 @@
from pandas._config import (
config,
get_option,
using_pyarrow_string_dtype,
)

from pandas._libs import (
lib,
writers as libwriters,
)
from pandas._libs.lib import is_string_array
from pandas._libs.tslibs import timezones
from pandas.compat._optional import import_optional_dependency
from pandas.compat.pickle_compat import patch_pickle
@@ -66,6 +68,7 @@
)
from pandas.core.dtypes.missing import array_equivalent

import pandas as pd
from pandas import (
DataFrame,
DatetimeIndex,
@@ -3219,7 +3222,12 @@ def read(
self.validate_read(columns, where)
index = self.read_index("index", start=start, stop=stop)
values = self.read_array("values", start=start, stop=stop)
return Series(values, index=index, name=self.name, copy=False)
result = Series(values, index=index, name=self.name, copy=False)
if using_pyarrow_string_dtype() and is_string_array(values, skipna=True):
import pyarrow as pa

result = result.astype(pd.ArrowDtype(pa.string()))
return result

# error: Signature of "write" incompatible with supertype "Fixed"
def write(self, obj, **kwargs) -> None: # type: ignore[override]
@@ -3287,6 +3295,10 @@ def read(

columns = items[items.get_indexer(blk_items)]
df = DataFrame(values.T, columns=columns, index=axes[1], copy=False)
if using_pyarrow_string_dtype() and is_string_array(values, skipna=True):
import pyarrow as pa

df = df.astype(pd.ArrowDtype(pa.string()))
dfs.append(df)

if len(dfs) > 0:
@@ -4668,7 +4680,15 @@ def read(
else:
# Categorical
df = DataFrame._from_arrays([values], columns=cols_, index=index_)
assert (df.dtypes == values.dtype).all(), (df.dtypes, values.dtype)
if not (using_pyarrow_string_dtype() and values.dtype.kind == "O"):
assert (df.dtypes == values.dtype).all(), (df.dtypes, values.dtype)
if using_pyarrow_string_dtype() and is_string_array(
values, # type: ignore[arg-type]
skipna=True,
):
import pyarrow as pa

df = df.astype(pd.ArrowDtype(pa.string()))
frames.append(df)

if len(frames) == 1:
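The pytables changes above gate the Arrow cast on is_string_array from pandas._libs.lib; a hedged sketch of that check (a pandas-internal helper, example arrays invented), where skipna=True lets missing values through:

import numpy as np
from pandas._libs.lib import is_string_array

print(is_string_array(np.array(["a", "b", None], dtype=object), skipna=True))  # True
print(is_string_array(np.array(["a", 1], dtype=object), skipna=True))          # False
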
7 changes: 3 additions & 4 deletions pandas/tests/io/parser/dtypes/test_dtypes_basic.py
@@ -547,15 +547,14 @@ def test_string_inference(all_parsers):

data = """a,b
x,1

Member:

Can you add a test case with null/nan/None like in your other PR?

Member Author:

I can add a missing field; actually, having these values doesn't make much sense.

Member Author:

Added

y,2"""
y,2
,3"""
parser = all_parsers
if parser.engine == "pyarrow":
pytest.skip("TODO: Follow up")
with pd.option_context("future.infer_string", True):
result = parser.read_csv(StringIO(data))

expected = DataFrame(
{"a": pd.Series(["x", "y"], dtype=dtype), "b": [1, 2]},
{"a": pd.Series(["x", "y", None], dtype=dtype), "b": [1, 2, 3]},
columns=pd.Index(["a", "b"], dtype=dtype),
)
tm.assert_frame_equal(result, expected)
16 changes: 16 additions & 0 deletions pandas/tests/io/pytables/test_read.py
@@ -388,3 +388,19 @@ def test_read_py2_hdf_file_in_py3(datapath):
) as store:
result = store["p"]
tm.assert_frame_equal(result, expected)


def test_read_infer_string(tmp_path, setup_path):
# GH#54431
pa = pytest.importorskip("pyarrow")
df = DataFrame({"a": ["a", "b", None]})
path = tmp_path / setup_path
df.to_hdf(path, key="data", format="table")
with pd.option_context("future.infer_string", True):
result = read_hdf(path, key="data", mode="r")
expected = DataFrame(
{"a": ["a", "b", None]},
dtype=pd.ArrowDtype(pa.string()),
columns=Index(["a"], dtype=pd.ArrowDtype(pa.string())),
)
tm.assert_frame_equal(result, expected)
14 changes: 14 additions & 0 deletions pandas/tests/io/test_feather.py
@@ -219,3 +219,17 @@ def test_invalid_dtype_backend(self):
df.to_feather(path)
with pytest.raises(ValueError, match=msg):
read_feather(path, dtype_backend="numpy")

def test_string_inference(self, tmp_path):
# GH#54431
import pyarrow as pa

path = tmp_path / "test_string_inference.p"
df = pd.DataFrame(data={"a": ["x", "y"]})
df.to_feather(path)
with pd.option_context("future.infer_string", True):
result = read_feather(path)
expected = pd.DataFrame(
data={"a": ["x", "y"]}, dtype=pd.ArrowDtype(pa.string())
)
tm.assert_frame_equal(result, expected)
15 changes: 15 additions & 0 deletions pandas/tests/io/test_orc.py
@@ -415,3 +415,18 @@ def test_invalid_dtype_backend():
df.to_orc(path)
with pytest.raises(ValueError, match=msg):
read_orc(path, dtype_backend="numpy")


def test_string_inference(tmp_path):
# GH#54431
path = tmp_path / "test_string_inference.p"
df = pd.DataFrame(data={"a": ["x", "y"]})
df.to_orc(path)
with pd.option_context("future.infer_string", True):
result = read_orc(path)
expected = pd.DataFrame(
data={"a": ["x", "y"]},
dtype=pd.ArrowDtype(pa.string()),
columns=pd.Index(["a"], dtype=pd.ArrowDtype(pa.string())),
)
tm.assert_frame_equal(result, expected)
16 changes: 16 additions & 0 deletions pandas/tests/io/test_parquet.py
@@ -1103,6 +1103,22 @@ def test_df_attrs_persistence(self, tmp_path, pa):
new_df = read_parquet(path, engine=pa)
assert new_df.attrs == df.attrs

def test_string_inference(self, tmp_path, pa):
# GH#54431
import pyarrow as pa

path = tmp_path / "test_string_inference.p"
df = pd.DataFrame(data={"a": ["x", "y"]}, index=["a", "b"])
df.to_parquet(path, engine="pyarrow")
with pd.option_context("future.infer_string", True):
result = read_parquet(path, engine="pyarrow")
expected = pd.DataFrame(
data={"a": ["x", "y"]},
dtype=pd.ArrowDtype(pa.string()),
index=pd.Index(["a", "b"], dtype=pd.ArrowDtype(pa.string())),
)
tm.assert_frame_equal(result, expected)


class TestParquetFastParquet(Base):
def test_basic(self, fp, df_full):