Skip to content

Commit 255b796

Browse files
authored
Backport fastparquet 0.7 compat (PR pandas-dev#42954 and pandas-dev#42919) (pandas-dev#42987)
1 parent 0130d77 commit 255b796

File tree

6 files changed

+62
-38
lines changed

6 files changed

+62
-38
lines changed

ci/deps/actions-37-db.yaml

+2-2
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ dependencies:
1515
- beautifulsoup4
1616
- botocore>=1.11
1717
- dask
18-
- fastparquet>=0.4.0, < 0.7.0
18+
- fastparquet>=0.4.0
1919
- fsspec>=0.7.4, <2021.6.0
2020
- gcsfs>=0.6.0
2121
- geopandas
@@ -25,7 +25,7 @@ dependencies:
2525
- flask
2626
- nomkl
2727
- numexpr
28-
- numpy=1.17.*
28+
- numpy=1.18.*
2929
- odfpy
3030
- openpyxl
3131
- pandas-gbq

ci/deps/azure-windows-38.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ dependencies:
1515
# pandas dependencies
1616
- blosc
1717
- bottleneck
18-
- fastparquet>=0.4.0, <0.7.0
18+
- fastparquet>=0.4.0
1919
- flask
2020
- fsspec>=0.8.0, <2021.6.0
2121
- matplotlib=3.1.3

environment.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ dependencies:
9999
- xlwt
100100
- odfpy
101101

102-
- fastparquet>=0.3.2, <0.7.0 # pandas.read_parquet, DataFrame.to_parquet
102+
- fastparquet>=0.3.2 # pandas.read_parquet, DataFrame.to_parquet
103103
- pyarrow>=0.17.0 # pandas.read_parquet, DataFrame.to_parquet, pandas.read_feather, DataFrame.to_feather
104104
- python-snappy # required by pyarrow
105105

pandas/io/parquet.py

+7-2
Original file line numberDiff line numberDiff line change
@@ -309,14 +309,17 @@ def write(
309309
def read(
310310
self, path, columns=None, storage_options: StorageOptions = None, **kwargs
311311
):
312+
parquet_kwargs: dict[str, Any] = {}
312313
use_nullable_dtypes = kwargs.pop("use_nullable_dtypes", False)
314+
if Version(self.api.__version__) >= Version("0.7.1"):
315+
# We are disabling nullable dtypes for fastparquet pending discussion
316+
parquet_kwargs["pandas_nulls"] = False
313317
if use_nullable_dtypes:
314318
raise ValueError(
315319
"The 'use_nullable_dtypes' argument is not supported for the "
316320
"fastparquet engine"
317321
)
318322
path = stringify_path(path)
319-
parquet_kwargs = {}
320323
handles = None
321324
if is_fsspec_url(path):
322325
fsspec = import_optional_dependency("fsspec")
@@ -337,6 +340,7 @@ def read(
337340
path, "rb", is_text=False, storage_options=storage_options
338341
)
339342
path = handles.handle
343+
340344
parquet_file = self.api.ParquetFile(path, **parquet_kwargs)
341345

342346
result = parquet_file.to_pandas(columns=columns, **kwargs)
@@ -470,7 +474,8 @@ def read_parquet(
470474
471475
use_nullable_dtypes : bool, default False
472476
If True, use dtypes that use ``pd.NA`` as missing value indicator
473-
for the resulting DataFrame (only applicable for ``engine="pyarrow"``).
477+
for the resulting DataFrame. (only applicable for the ``pyarrow``
478+
engine)
474479
As new dtypes are added that support ``pd.NA`` in the future, the
475480
output with this option will change to use those dtypes.
476481
Note: this is an experimental option, and behaviour (e.g. additional

pandas/tests/io/test_parquet.py

+50-31
Original file line numberDiff line numberDiff line change
@@ -575,6 +575,47 @@ def test_write_column_index_nonstring(self, pa):
575575
msg = r"parquet must have string column names"
576576
self.check_error_on_write(df, engine, ValueError, msg)
577577

578+
def test_use_nullable_dtypes(self, engine):
579+
import pyarrow.parquet as pq
580+
581+
if engine == "fastparquet":
582+
# We are manually disabling fastparquet's
583+
# nullable dtype support pending discussion
584+
pytest.skip("Fastparquet nullable dtype support is disabled")
585+
586+
table = pyarrow.table(
587+
{
588+
"a": pyarrow.array([1, 2, 3, None], "int64"),
589+
"b": pyarrow.array([1, 2, 3, None], "uint8"),
590+
"c": pyarrow.array(["a", "b", "c", None]),
591+
"d": pyarrow.array([True, False, True, None]),
592+
# Test that nullable dtypes used even in absence of nulls
593+
"e": pyarrow.array([1, 2, 3, 4], "int64"),
594+
}
595+
)
596+
with tm.ensure_clean() as path:
597+
# write manually with pyarrow to write integers
598+
pq.write_table(table, path)
599+
result1 = read_parquet(path, engine=engine)
600+
result2 = read_parquet(path, engine=engine, use_nullable_dtypes=True)
601+
602+
assert result1["a"].dtype == np.dtype("float64")
603+
expected = pd.DataFrame(
604+
{
605+
"a": pd.array([1, 2, 3, None], dtype="Int64"),
606+
"b": pd.array([1, 2, 3, None], dtype="UInt8"),
607+
"c": pd.array(["a", "b", "c", None], dtype="string"),
608+
"d": pd.array([True, False, True, None], dtype="boolean"),
609+
"e": pd.array([1, 2, 3, 4], dtype="Int64"),
610+
}
611+
)
612+
if engine == "fastparquet":
613+
# Fastparquet doesn't support string columns yet
614+
# Only int and boolean
615+
result2 = result2.drop("c", axis=1)
616+
expected = expected.drop("c", axis=1)
617+
tm.assert_frame_equal(result2, expected)
618+
578619

579620
@pytest.mark.filterwarnings("ignore:CategoricalBlock is deprecated:DeprecationWarning")
580621
class TestParquetPyArrow(Base):
@@ -829,35 +870,6 @@ def test_additional_extension_types(self, pa):
829870
)
830871
check_round_trip(df, pa)
831872

832-
@td.skip_if_no("pyarrow")
833-
def test_use_nullable_dtypes(self, pa):
834-
import pyarrow.parquet as pq
835-
836-
table = pyarrow.table(
837-
{
838-
"a": pyarrow.array([1, 2, 3, None], "int64"),
839-
"b": pyarrow.array([1, 2, 3, None], "uint8"),
840-
"c": pyarrow.array(["a", "b", "c", None]),
841-
"d": pyarrow.array([True, False, True, None]),
842-
}
843-
)
844-
with tm.ensure_clean() as path:
845-
# write manually with pyarrow to write integers
846-
pq.write_table(table, path)
847-
result1 = read_parquet(path)
848-
result2 = read_parquet(path, use_nullable_dtypes=True)
849-
850-
assert result1["a"].dtype == np.dtype("float64")
851-
expected = pd.DataFrame(
852-
{
853-
"a": pd.array([1, 2, 3, None], dtype="Int64"),
854-
"b": pd.array([1, 2, 3, None], dtype="UInt8"),
855-
"c": pd.array(["a", "b", "c", None], dtype="string"),
856-
"d": pd.array([True, False, True, None], dtype="boolean"),
857-
}
858-
)
859-
tm.assert_frame_equal(result2, expected)
860-
861873
def test_timestamp_nanoseconds(self, pa):
862874
# with version 2.0, pyarrow defaults to writing the nanoseconds, so
863875
# this should work without error
@@ -928,7 +940,9 @@ def test_duplicate_columns(self, fp):
928940
def test_bool_with_none(self, fp):
929941
df = pd.DataFrame({"a": [True, None, False]})
930942
expected = pd.DataFrame({"a": [1.0, np.nan, 0.0]}, dtype="float16")
931-
check_round_trip(df, fp, expected=expected)
943+
# Fastparquet bug in 0.7.1 makes it so that this dtype becomes
944+
# float64
945+
check_round_trip(df, fp, expected=expected, check_dtype=False)
932946

933947
def test_unsupported(self, fp):
934948

@@ -1049,9 +1063,14 @@ def test_timezone_aware_index(self, fp, timezone_aware_date_list):
10491063
expected.index.name = "index"
10501064
check_round_trip(df, fp, expected=expected)
10511065

1052-
def test_use_nullable_dtypes_not_supported(self, fp):
1066+
def test_use_nullable_dtypes_not_supported(self, monkeypatch, fp):
10531067
df = pd.DataFrame({"a": [1, 2]})
10541068

1069+
# This is supported now in fastparquet 0.7.1 and above actually
1070+
# Still need to ensure that this raises in all versions below
1071+
import fastparquet as fp
1072+
1073+
monkeypatch.setattr(fp, "__version__", "0.4")
10551074
with tm.ensure_clean() as path:
10561075
df.to_parquet(path)
10571076
with pytest.raises(ValueError, match="not supported for the fastparquet"):

requirements-dev.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ xlrd
6464
xlsxwriter
6565
xlwt
6666
odfpy
67-
fastparquet>=0.3.2, <0.7.0
67+
fastparquet>=0.3.2
6868
pyarrow>=0.17.0
6969
python-snappy
7070
pyqt5>=5.9.2

0 commit comments

Comments (0)