-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
ENH: Implement io.nullable_backend config for read_csv(engine="pyarrow") #49366
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 6 commits
73d23a1
274f2f3
da5bf6e
4b90df6
9f6ad52
8bdabaa
145c46a
23a5289
fc26b63
8542faf
2076eff
e025b89
cc573ea
37de048
6c0c86c
35fb118
ce822e5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,16 +1,17 @@ | ||
from __future__ import annotations | ||
|
||
from typing import TYPE_CHECKING | ||
|
||
from pandas._typing import ReadBuffer | ||
from pandas.compat._optional import import_optional_dependency | ||
|
||
from pandas.core.dtypes.inference import is_integer | ||
|
||
from pandas.io.parsers.base_parser import ParserBase | ||
from pandas import ( | ||
DataFrame, | ||
arrays, | ||
get_option, | ||
) | ||
|
||
if TYPE_CHECKING: | ||
from pandas import DataFrame | ||
from pandas.io.parsers.base_parser import ParserBase | ||
|
||
|
||
class ArrowParserWrapper(ParserBase): | ||
|
@@ -77,7 +78,7 @@ def _get_pyarrow_options(self) -> None: | |
else self.kwds["skiprows"], | ||
} | ||
|
||
def _finalize_output(self, frame: DataFrame) -> DataFrame: | ||
def _finalize_pandas_output(self, frame: DataFrame) -> DataFrame: | ||
""" | ||
Processes data read in based on kwargs. | ||
|
||
|
@@ -150,6 +151,16 @@ def read(self) -> DataFrame: | |
parse_options=pyarrow_csv.ParseOptions(**self.parse_options), | ||
convert_options=pyarrow_csv.ConvertOptions(**self.convert_options), | ||
) | ||
|
||
frame = table.to_pandas() | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Do you know if the There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
The implementation here just directly converts arrow table -> arrow ChunkedArray and sticks it into pandas |
||
return self._finalize_output(frame) | ||
if ( | ||
self.kwds["use_nullable_dtypes"] | ||
and get_option("io.nullable_backend") == "pyarrow" | ||
): | ||
frame = DataFrame( | ||
{ | ||
col_name: arrays.ArrowExtensionArray(pa_col) | ||
for col_name, pa_col in zip(table.column_names, table.itercolumns()) | ||
} | ||
) | ||
else: | ||
frame = table.to_pandas() | ||
return self._finalize_pandas_output(frame) |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -22,6 +22,8 @@ | |
|
||
import numpy as np | ||
|
||
from pandas._config import get_option | ||
|
||
from pandas._libs import lib | ||
from pandas._libs.parsers import STR_NA_VALUES | ||
from pandas._typing import ( | ||
|
@@ -600,6 +602,14 @@ def _read( | |
raise ValueError( | ||
"The 'chunksize' option is not supported with the 'pyarrow' engine" | ||
) | ||
elif ( | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Was there a reason to disable this for the C/Python parsers? If not, I'm OK with allowing this for consistency purposes. Given the bugginess/lack of support for some kwargs in the pyarrow engine (sorry for not really maintaining it after adding it), a user might want to read with the C/Python engine first, before operating on arrow arrays. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Not entirely; I just wanted to keep the changes of this PR small for now and add support in a follow-up PR. |
||
kwds.get("use_nullable_dtypes", False) | ||
and get_option("io.nullable_backend") == "pyarrow" | ||
): | ||
raise NotImplementedError( | ||
f"use_nullable_dtypes=True and engine={kwds['engine']} with " | ||
"io.nullable_backend set to 'pyarrow' is not implemented." | ||
) | ||
else: | ||
chunksize = validate_integer("chunksize", chunksize, 1) | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It might be worth giving this a section of its own.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Good idea. Moved to its own section.