-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
ENH: Add io.nullable_backend=pyarrow support to read_excel #49965
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
d5c82b6
b9cb462
b5d87aa
434418a
a106949
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -536,7 +536,11 @@ def test_reader_dtype_str(self, read_ext, dtype, expected): | |
actual = pd.read_excel(basename + read_ext, dtype=dtype) | ||
tm.assert_frame_equal(actual, expected) | ||
|
||
def test_use_nullable_dtypes(self, read_ext): | ||
@pytest.mark.parametrize( | ||
"nullable_backend", | ||
["pandas", pytest.param("pyarrow", marks=td.skip_if_no("pyarrow"))], | ||
) | ||
def test_use_nullable_dtypes(self, read_ext, nullable_backend): | ||
# GH#36712 | ||
if read_ext in (".xlsb", ".xls"): | ||
pytest.skip(f"No engine for filetype: '{read_ext}'") | ||
|
@@ -557,10 +561,30 @@ def test_use_nullable_dtypes(self, read_ext): | |
) | ||
with tm.ensure_clean(read_ext) as file_path: | ||
df.to_excel(file_path, "test", index=False) | ||
result = pd.read_excel( | ||
file_path, sheet_name="test", use_nullable_dtypes=True | ||
with pd.option_context("io.nullable_backend", nullable_backend): | ||
result = pd.read_excel( | ||
file_path, sheet_name="test", use_nullable_dtypes=True | ||
) | ||
if nullable_backend == "pyarrow": | ||
import pyarrow as pa | ||
|
||
from pandas.arrays import ArrowExtensionArray | ||
|
||
expected = DataFrame( | ||
{ | ||
col: ArrowExtensionArray(pa.array(df[col], from_pandas=True)) | ||
for col in df.columns | ||
} | ||
) | ||
tm.assert_frame_equal(result, df) | ||
# pyarrow by default infers timestamp resolution as us, not ns | ||
expected["i"] = ArrowExtensionArray( | ||
expected["i"].array._data.cast(pa.timestamp(unit="us")) | ||
) | ||
# pyarrow supports a null type, so don't have to default to Int64 | ||
expected["j"] = ArrowExtensionArray(pa.array([None, None])) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @phofl I noticed that in There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, if you do:
you get back Int64 for all columns. Not saying this is perfect, but it made sense to model it after the existing behavior. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah okay makes sense why this is the case then |
||
else: | ||
expected = df | ||
tm.assert_frame_equal(result, expected) | ||
|
||
def test_use_nullabla_dtypes_and_dtype(self, read_ext): | ||
# GH#36712 | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
cc @phofl this is the section where we can expand on
use_nullable_dtype
. Let me know if I should expand on anything in this PRThere was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I was thinking about adding a really short example with only one or 2 columns for read_csv. To show a bit better how to use it.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should I expand on the example below (it's hidden in the diff) to show
pd.option_context("io.nullable_backend", "pandas")
as well?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ah sorry, missed that. Yes a pandas example would be great.
Another thing: Maybe make the functions bullet points, e.g.
So that they become a bit more prominent?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Good idea with the bullet points. Also added an example with the pandas example