-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
ENH: Implement arrow string option for various I/O methods #54431
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 8 commits
3904245
ebe0bd5
0889028
28ace4b
f2b5992
35a8240
b677a89
11b267e
0f79a2f
8072a86
bed3124
efb6f4a
0ac28a1
ff38a29
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1299,6 +1299,7 @@ cdef class Seen: | |
bint datetimetz_ # seen_datetimetz | ||
bint period_ # seen_period | ||
bint interval_ # seen_interval | ||
bint str_ # seen_str | ||
|
||
def __cinit__(self, bint coerce_numeric=False): | ||
""" | ||
|
@@ -1325,6 +1326,7 @@ cdef class Seen: | |
self.datetimetz_ = False | ||
self.period_ = False | ||
self.interval_ = False | ||
self.str_ = False | ||
self.coerce_numeric = coerce_numeric | ||
|
||
cdef bint check_uint64_conflict(self) except -1: | ||
|
@@ -2615,6 +2617,13 @@ def maybe_convert_objects(ndarray[object] objects, | |
else: | ||
seen.object_ = True | ||
break | ||
elif isinstance(val, str): | ||
if convert_non_numeric: | ||
seen.str_ = True | ||
break | ||
else: | ||
seen.object_ = True | ||
break | ||
else: | ||
seen.object_ = True | ||
break | ||
|
@@ -2669,6 +2678,20 @@ def maybe_convert_objects(ndarray[object] objects, | |
return pi._data | ||
seen.object_ = True | ||
|
||
elif seen.str_: | ||
if is_string_array(objects): | ||
from pandas._config import get_option | ||
opt = get_option("future.infer_string") | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can we pass this in as a kwarg to `maybe_convert_objects`? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I'd rather only get the option if actually needed. |
||
if opt is True: | ||
import pyarrow as pa | ||
|
||
from pandas.core.dtypes.dtypes import ArrowDtype | ||
|
||
obj = pa.array(objects) | ||
dtype = ArrowDtype(obj.type) | ||
return dtype.construct_array_type()(obj) | ||
|
||
seen.object_ = True | ||
elif seen.interval_: | ||
if is_interval_array(objects): | ||
from pandas import IntervalIndex | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -18,6 +18,8 @@ | |
|
||
import numpy as np | ||
|
||
from pandas._config import get_option | ||
|
||
from pandas._libs import lib | ||
from pandas._libs.missing import ( | ||
NA, | ||
|
@@ -796,6 +798,12 @@ def infer_dtype_from_scalar(val) -> tuple[DtypeObj, Any]: | |
# coming out as np.str_! | ||
|
||
dtype = _dtype_obj | ||
opt = get_option("future.infer_string") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Follow-up: this is introduced in the other PR (a little bit confusing, sorry). There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Added. |
||
if opt is True: | ||
import pyarrow as pa | ||
|
||
pa_dtype = pa.string() | ||
dtype = ArrowDtype(pa_dtype) | ||
|
||
elif isinstance(val, (np.datetime64, dt.datetime)): | ||
try: | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,7 @@ | ||
from __future__ import annotations | ||
|
||
from typing import Callable | ||
|
||
from pandas.compat._optional import import_optional_dependency | ||
|
||
import pandas as pd | ||
|
@@ -21,3 +23,9 @@ def _arrow_dtype_mapping() -> dict: | |
pa.float32(): pd.Float32Dtype(), | ||
pa.float64(): pd.Float64Dtype(), | ||
} | ||
|
||
|
||
def arrow_string_types_mapper() -> Callable: | ||
pa = import_optional_dependency("pyarrow") | ||
|
||
return {pa.string(): pd.ArrowDtype(pa.string())}.get | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Thinking about this a little, is there a situation where you would want to mix pyarrow and numpy dtypes? (I'm thinking maybe we should force users to pick the pyarrow dtype backend if they are using the pyarrow string type.) There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, there are a lot of situations. NumPy numeric dtypes with Arrow strings are still the fastest combination, and NumPy numeric data is 2D. Forcing them right now is not a good idea. |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -9,6 +9,8 @@ | |
Literal, | ||
) | ||
|
||
from pandas._config import using_pyarrow_string_dtype | ||
|
||
from pandas._libs import lib | ||
from pandas.compat import pa_version_under8p0 | ||
from pandas.compat._optional import import_optional_dependency | ||
|
@@ -24,6 +26,7 @@ | |
import pandas as pd | ||
from pandas.core.indexes.api import default_index | ||
|
||
from pandas.io._util import arrow_string_types_mapper | ||
from pandas.io.common import ( | ||
get_handle, | ||
is_fsspec_url, | ||
|
@@ -132,7 +135,12 @@ def read_orc( | |
df = pa_table.to_pandas(types_mapper=mapping.get) | ||
return df | ||
else: | ||
return pa_table.to_pandas() | ||
print("Ts") | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Remove this leftover debugging `print` call. |
||
if using_pyarrow_string_dtype(): | ||
types_mapper = arrow_string_types_mapper() | ||
else: | ||
types_mapper = None | ||
return pa_table.to_pandas(types_mapper=types_mapper) | ||
|
||
|
||
def to_orc( | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -30,6 +30,7 @@ | |
from pandas._config import ( | ||
config, | ||
get_option, | ||
using_pyarrow_string_dtype, | ||
) | ||
|
||
from pandas._libs import ( | ||
|
@@ -66,6 +67,7 @@ | |
) | ||
from pandas.core.dtypes.missing import array_equivalent | ||
|
||
import pandas as pd | ||
from pandas import ( | ||
DataFrame, | ||
DatetimeIndex, | ||
|
@@ -3219,7 +3221,12 @@ def read( | |
self.validate_read(columns, where) | ||
index = self.read_index("index", start=start, stop=stop) | ||
values = self.read_array("values", start=start, stop=stop) | ||
return Series(values, index=index, name=self.name, copy=False) | ||
result = Series(values, index=index, name=self.name, copy=False) | ||
if result.dtype.kind == "O" and using_pyarrow_string_dtype(): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Not too familiar with this code, but do we need to check whether `result` is a string array first before doing this? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yeah, that makes sense. |
||
import pyarrow as pa | ||
|
||
result = result.astype(pd.ArrowDtype(pa.string())) | ||
return result | ||
|
||
# error: Signature of "write" incompatible with supertype "Fixed" | ||
def write(self, obj, **kwargs) -> None: # type: ignore[override] | ||
|
@@ -3287,6 +3294,10 @@ def read( | |
|
||
columns = items[items.get_indexer(blk_items)] | ||
df = DataFrame(values.T, columns=columns, index=axes[1], copy=False) | ||
if values.dtype.kind == "O" and using_pyarrow_string_dtype(): | ||
import pyarrow as pa | ||
|
||
df = df.astype(pd.ArrowDtype(pa.string())) | ||
dfs.append(df) | ||
|
||
if len(dfs) > 0: | ||
|
@@ -4669,6 +4680,10 @@ def read( | |
# Categorical | ||
df = DataFrame._from_arrays([values], columns=cols_, index=index_) | ||
assert (df.dtypes == values.dtype).all(), (df.dtypes, values.dtype) | ||
if values.dtype.kind == "O" and using_pyarrow_string_dtype(): | ||
import pyarrow as pa | ||
|
||
df = df.astype(pd.ArrowDtype(pa.string())) | ||
frames.append(df) | ||
|
||
if len(frames) == 1: | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -538,3 +538,22 @@ def test_ea_int_avoid_overflow(all_parsers): | |
} | ||
) | ||
tm.assert_frame_equal(result, expected) | ||
|
||
|
||
def test_string_inference(all_parsers): | ||
# GH#54430 | ||
pa = pytest.importorskip("pyarrow") | ||
dtype = pd.ArrowDtype(pa.string()) | ||
|
||
data = """a,b | ||
x,1 | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you add a test case with null/NaN/None like in your other PR? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I can add a missing field; actually having these values doesn't make much sense. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Added. |
||
y,2""" | ||
parser = all_parsers | ||
with pd.option_context("future.infer_string", True): | ||
result = parser.read_csv(StringIO(data)) | ||
|
||
expected = DataFrame( | ||
{"a": pd.Series(["x", "y"], dtype=dtype), "b": [1, 2]}, | ||
columns=pd.Index(["a", "b"], dtype=dtype), | ||
) | ||
tm.assert_frame_equal(result, expected) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I know everywhere else does this, but is there a way to avoid this double parsing?
(Maybe we check the other flags are all false?)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No, you exit the first loop as soon as you find one string