BUG/TST: fix arrow roundtrip / parquet tests for recent pyarrow (#30077)

jorisvandenbossche · web-flow · commit 4e807a292380 · 2019-12-18T17:56:41.000+01:00
diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py
@@ -103,6 +103,24 @@ def __repr__(self) -> str:
     def _is_boolean(self) -> bool:
         return True
 
+    def __from_arrow__(self, array):
+        """Construct a BooleanArray from the passed pyarrow Array/ChunkedArray."""
+        import pyarrow
+
+        if isinstance(array, pyarrow.Array):
+            chunks = [array]
+        else:
+            # pyarrow.ChunkedArray
+            chunks = array.chunks
+
+        results = []
+        for arr in chunks:
+            # TODO should optimize this without going through object array
+            bool_arr = BooleanArray._from_sequence(np.array(arr))
+            results.append(bool_arr)
+
+        return BooleanArray._concat_same_type(results)
+
 
 def coerce_to_array(values, mask=None, copy: bool = False):
     """
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
@@ -86,7 +86,7 @@ def __from_arrow__(self, array):
 
         results = []
         for arr in chunks:
-            # using _from_sequence to ensure None is convered to np.nan
+            # using _from_sequence to ensure None is converted to NA
             str_arr = StringArray._from_sequence(np.array(arr))
             results.append(str_arr)
 
@@ -208,7 +208,10 @@ def __arrow_array__(self, type=None):
 
         if type is None:
             type = pa.string()
-        return pa.array(self._ndarray, type=type, from_pandas=True)
+
+        values = self._ndarray.copy()
+        values[self.isna()] = None
+        return pa.array(values, type=type, from_pandas=True)
 
     def _values_for_factorize(self):
         arr = self._ndarray.copy()
diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
@@ -235,5 +235,5 @@ def test_arrow_roundtrip():
     result = table.to_pandas()
     assert isinstance(result["a"].dtype, pd.StringDtype)
     tm.assert_frame_equal(result, df)
-    # ensure the missing value is represented by NaN and not None
-    assert np.isnan(result.loc[2, "a"])
+    # ensure the missing value is represented by NA and not np.nan or None
+    assert result.loc[2, "a"] is pd.NA
diff --git a/pandas/tests/arrays/test_boolean.py b/pandas/tests/arrays/test_boolean.py
@@ -757,12 +757,29 @@ def test_any_all(values, exp_any, exp_all, exp_any_noskip, exp_all_noskip):
 #         result = arr[mask]
 
 
-@pytest.mark.skip(reason="broken test")
 @td.skip_if_no("pyarrow", min_version="0.15.0")
 def test_arrow_array(data):
     # protocol added in 0.15.0
     import pyarrow as pa
 
     arr = pa.array(data)
-    expected = pa.array(np.array(data, dtype=object), type=pa.bool_(), from_pandas=True)
+
+    # TODO use to_numpy(na_value=None) here
+    data_object = np.array(data, dtype=object)
+    data_object[data.isna()] = None
+    expected = pa.array(data_object, type=pa.bool_(), from_pandas=True)
     assert arr.equals(expected)
+
+
+@td.skip_if_no("pyarrow", min_version="0.15.1.dev")
+def test_arrow_roundtrip():
+    # roundtrip possible from arrow 1.0.0
+    import pyarrow as pa
+
+    data = pd.array([True, False, None], dtype="boolean")
+    df = pd.DataFrame({"a": data})
+    table = pa.table(df)
+    assert table.field("a").type == "bool"
+    result = table.to_pandas()
+    assert isinstance(result["a"].dtype, pd.BooleanDtype)
+    tm.assert_frame_equal(result, df)
diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
@@ -525,7 +525,6 @@ def test_write_with_schema(self, pa):
         out_df = df.astype(bool)
         check_round_trip(df, pa, write_kwargs={"schema": schema}, expected=out_df)
 
-    @pytest.mark.skip(reason="broken test")
     @td.skip_if_no("pyarrow", min_version="0.15.0")
     def test_additional_extension_arrays(self, pa):
         # test additional ExtensionArrays that are supported through the