-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
BUG: Fix some more arrow CSV tests #52087
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.
Already on GitHub? Sign in to your account
Changes from 10 commits
d35ffd2
04857b3
fb08b13
eaff558
c0268d9
d7d5e32
dc44d91
b4014c1
671df0f
18450ed
3d3d5f3
32935b2
2b4aae7
43db279
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -80,6 +80,7 @@ def _get_pyarrow_options(self) -> None: | |
"decimal_point", | ||
) | ||
} | ||
self.convert_options["strings_can_be_null"] = "" in self.kwds["null_values"] | ||
self.read_options = { | ||
"autogenerate_column_names": self.header is None, | ||
"skip_rows": self.header | ||
|
@@ -149,6 +150,7 @@ def read(self) -> DataFrame: | |
DataFrame | ||
The DataFrame created from the CSV file. | ||
""" | ||
pa = import_optional_dependency("pyarrow") | ||
pyarrow_csv = import_optional_dependency("pyarrow.csv") | ||
self._get_pyarrow_options() | ||
|
||
|
@@ -158,9 +160,29 @@ def read(self) -> DataFrame: | |
parse_options=pyarrow_csv.ParseOptions(**self.parse_options), | ||
convert_options=pyarrow_csv.ConvertOptions(**self.convert_options), | ||
) | ||
if self.kwds["dtype_backend"] == "pyarrow": | ||
|
||
dtype_backend = self.kwds["dtype_backend"] | ||
|
||
# Convert all pa.null() cols -> float64 (non nullable) | ||
# else Int64 (nullable case) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This was really confusing for me, but apparently even There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I am not sure we should replicate that here. But if you don't know the dtype you are starting with (like here), I think using float is safer (it won't give problems later on if you then want to use it for floats and not just integers). Or actually using object dtype might even be safest (as that can represent everything), but of course we don't (yet) have a nullable object dtype, so that's probably not really an option. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The problem is that the C and python parsers already convert to an Int64. Is this something that we need to fix before 2.0?
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. cc @phofl. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We do this everywhere, so I am +1 on doing Int64 here as well. Initially this came from read_parquet I think |
||
# TODO: There has to be a better way... right? | ||
if dtype_backend != "pyarrow": | ||
new_schema = table.schema | ||
if dtype_backend == "numpy_nullable": | ||
new_type = pa.int64() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can't you just patch the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Will need to wait for #52223 then. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. LGTM, but yeah since #52223 is close lets wait for that one then merge this |
||
else: | ||
new_type = pa.float64() | ||
for i, arrow_type in enumerate(table.schema.types): | ||
if pa.types.is_null(arrow_type): | ||
new_schema = new_schema.set( | ||
i, new_schema.field(i).with_type(new_type) | ||
) | ||
|
||
table = table.cast(new_schema) | ||
|
||
if dtype_backend == "pyarrow": | ||
frame = table.to_pandas(types_mapper=pd.ArrowDtype) | ||
elif self.kwds["dtype_backend"] == "numpy_nullable": | ||
elif dtype_backend == "numpy_nullable": | ||
frame = table.to_pandas(types_mapper=_arrow_dtype_mapping().get) | ||
else: | ||
frame = table.to_pandas() | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -423,13 +423,9 @@ def test_dtype_backend(all_parsers): | |
"e": pd.Series([pd.NA, 6], dtype="Int64"), | ||
"f": pd.Series([pd.NA, 7.5], dtype="Float64"), | ||
"g": pd.Series([pd.NA, True], dtype="boolean"), | ||
"h": pd.Series( | ||
[pd.NA if parser.engine != "pyarrow" else "", "a"], dtype="string" | ||
), | ||
"h": pd.Series([pd.NA, "a"], dtype="string"), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thx for these |
||
"i": pd.Series([Timestamp("2019-12-31")] * 2), | ||
"j": pd.Series( | ||
[pd.NA, pd.NA], dtype="Int64" if parser.engine != "pyarrow" else object | ||
), | ||
"j": pd.Series([pd.NA, pd.NA], dtype="Int64"), | ||
} | ||
) | ||
tm.assert_frame_equal(result, expected) | ||
|
@@ -451,7 +447,6 @@ def test_dtype_backend_and_dtype(all_parsers): | |
tm.assert_frame_equal(result, expected) | ||
|
||
|
||
@pytest.mark.usefixtures("pyarrow_xfail") | ||
def test_dtype_backend_string(all_parsers, string_storage): | ||
# GH#36712 | ||
pa = pytest.importorskip("pyarrow") | ||
|
@@ -499,7 +494,6 @@ def test_dtype_backend_pyarrow(all_parsers, request): | |
# GH#36712 | ||
pa = pytest.importorskip("pyarrow") | ||
parser = all_parsers | ||
engine = parser.engine | ||
|
||
data = """a,b,c,d,e,f,g,h,i,j | ||
1,2.5,True,a,,,,,12-31-2019, | ||
|
@@ -516,7 +510,7 @@ def test_dtype_backend_pyarrow(all_parsers, request): | |
"f": pd.Series([pd.NA, 7.5], dtype="float64[pyarrow]"), | ||
"g": pd.Series([pd.NA, True], dtype="bool[pyarrow]"), | ||
"h": pd.Series( | ||
[pd.NA if engine != "pyarrow" else "", "a"], | ||
[pd.NA, "a"], | ||
dtype=pd.ArrowDtype(pa.string()), | ||
), | ||
"i": pd.Series([Timestamp("2019-12-31")] * 2), | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
In what way does it break parquet?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Well, first I hit this error.
After patching that (another PR incoming),
an empty DataFrame of dtype object
(pd.DataFrame({"value": pd.array([], dtype=object)}))
now returns a Float64Dtype when roundtripped. Best guess here is that the
types_mapper
is somehow overriding the pandas metadata.