From 9c9bf7b51bfdf3d17ff04af95c3749c8befc11f1 Mon Sep 17 00:00:00 2001
From: Thomas Li <47963215+lithomas1@users.noreply.github.com>
Date: Sun, 8 Aug 2021 16:09:18 -0700
Subject: [PATCH 1/4] Backport PR #42919: COMPAT: Support fastparquet 0.7.1

---
 ci/deps/actions-37-db.yaml      |  2 +-
 ci/deps/azure-windows-38.yaml   |  2 +-
 doc/source/whatsnew/v1.3.2.rst  |  2 +-
 environment.yml                 |  2 +-
 pandas/io/parquet.py            | 26 ++++++++---
 pandas/tests/io/test_parquet.py | 80 ++++++++++++++++++++-------------
 requirements-dev.txt            |  2 +-
 7 files changed, 73 insertions(+), 43 deletions(-)

diff --git a/ci/deps/actions-37-db.yaml b/ci/deps/actions-37-db.yaml
index cfdcf266236e6..a9e4113bf9d18 100644
--- a/ci/deps/actions-37-db.yaml
+++ b/ci/deps/actions-37-db.yaml
@@ -15,7 +15,7 @@ dependencies:
   - beautifulsoup4
   - botocore>=1.11
   - dask
-  - fastparquet>=0.4.0, < 0.7.0
+  - fastparquet>=0.4.0
   - fsspec>=0.7.4, <2021.6.0
   - gcsfs>=0.6.0
   - geopandas
diff --git a/ci/deps/azure-windows-38.yaml b/ci/deps/azure-windows-38.yaml
index 902daf102ccda..70aa46e8a5851 100644
--- a/ci/deps/azure-windows-38.yaml
+++ b/ci/deps/azure-windows-38.yaml
@@ -15,7 +15,7 @@ dependencies:
   # pandas dependencies
   - blosc
   - bottleneck
-  - fastparquet>=0.4.0, <0.7.0
+  - fastparquet>=0.4.0
   - flask
   - fsspec>=0.8.0, <2021.6.0
   - matplotlib=3.1.3
diff --git a/doc/source/whatsnew/v1.3.2.rst b/doc/source/whatsnew/v1.3.2.rst
index 669e824fa3989..b0ad5cd506fce 100644
--- a/doc/source/whatsnew/v1.3.2.rst
+++ b/doc/source/whatsnew/v1.3.2.rst
@@ -47,7 +47,7 @@ Bug fixes
 
 Other
 ~~~~~
--
+- :meth:`pandas.read_parquet` now supports reading nullable dtypes with ``fastparquet`` versions above 0.7.1.
 -
 
 .. ---------------------------------------------------------------------------
diff --git a/environment.yml b/environment.yml
index 20b7272e12ebb..e75e56238205b 100644
--- a/environment.yml
+++ b/environment.yml
@@ -99,7 +99,7 @@ dependencies:
   - xlwt
   - odfpy
 
-  - fastparquet>=0.3.2, <0.7.0  # pandas.read_parquet, DataFrame.to_parquet
+  - fastparquet>=0.3.2  # pandas.read_parquet, DataFrame.to_parquet
   - pyarrow>=0.17.0  # pandas.read_parquet, DataFrame.to_parquet, pandas.read_feather, DataFrame.to_feather
   - python-snappy  # required by pyarrow
 
diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py
index b7523fada07d0..f0aeeb3e6c893 100644
--- a/pandas/io/parquet.py
+++ b/pandas/io/parquet.py
@@ -309,14 +309,21 @@ def write(
     def read(
         self, path, columns=None, storage_options: StorageOptions = None, **kwargs
     ):
+        parquet_kwargs = {}
         use_nullable_dtypes = kwargs.pop("use_nullable_dtypes", False)
-        if use_nullable_dtypes:
-            raise ValueError(
-                "The 'use_nullable_dtypes' argument is not supported for the "
-                "fastparquet engine"
-            )
+        # Technically works with 0.7.0, but was incorrect
+        # so lets just require 0.7.1
+        if Version(self.api.__version__) >= Version("0.7.1"):
+            # Need to set even for use_nullable_dtypes = False,
+            # since our defaults differ
+            parquet_kwargs["pandas_nulls"] = use_nullable_dtypes
+        else:
+            if use_nullable_dtypes:
+                raise ValueError(
+                    "The 'use_nullable_dtypes' argument is not supported for the "
+                    "fastparquet engine for fastparquet versions less than 0.7.1"
+                )
         path = stringify_path(path)
-        parquet_kwargs = {}
         handles = None
         if is_fsspec_url(path):
             fsspec = import_optional_dependency("fsspec")
@@ -337,6 +344,7 @@ def read(
                 path, "rb", is_text=False, storage_options=storage_options
             )
             path = handles.handle
+
         parquet_file = self.api.ParquetFile(path, **parquet_kwargs)
 
         result = parquet_file.to_pandas(columns=columns, **kwargs)
@@ -470,7 +478,7 @@ def read_parquet(
 
     use_nullable_dtypes : bool, default False
         If True, use dtypes that use ``pd.NA`` as missing value indicator
-        for the resulting DataFrame (only applicable for ``engine="pyarrow"``).
+        for the resulting DataFrame.
         As new dtypes are added that support ``pd.NA`` in the future, the
         output with this option will change to use those dtypes.
         Note: this is an experimental option, and behaviour (e.g. additional
@@ -478,6 +486,10 @@ def read_parquet(
 
         .. versionadded:: 1.2.0
 
+        .. versionchanged:: 1.3.2
+            ``use_nullable_dtypes`` now works with the the ``fastparquet`` engine
+            if ``fastparquet`` is version 0.7.1 or higher.
+
     **kwargs
         Any additional kwargs are passed to the engine.
 
diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index d100c584b698a..b951e92c0fa9c 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -575,6 +575,46 @@ def test_write_column_index_nonstring(self, pa):
         msg = r"parquet must have string column names"
         self.check_error_on_write(df, engine, ValueError, msg)
 
+    def test_use_nullable_dtypes(self, engine):
+        import pyarrow.parquet as pq
+
+        if engine == "fastparquet":
+            pytest.importorskip(
+                "fastparquet",
+                "0.7.1",
+                reason="fastparquet must be 0.7.1 or higher for nullable dtype support",
+            )
+
+        table = pyarrow.table(
+            {
+                "a": pyarrow.array([1, 2, 3, None], "int64"),
+                "b": pyarrow.array([1, 2, 3, None], "uint8"),
+                "c": pyarrow.array(["a", "b", "c", None]),
+                "d": pyarrow.array([True, False, True, None]),
+            }
+        )
+        with tm.ensure_clean() as path:
+            # write manually with pyarrow to write integers
+            pq.write_table(table, path)
+            result1 = read_parquet(path, engine=engine)
+            result2 = read_parquet(path, engine=engine, use_nullable_dtypes=True)
+
+        assert result1["a"].dtype == np.dtype("float64")
+        expected = pd.DataFrame(
+            {
+                "a": pd.array([1, 2, 3, None], dtype="Int64"),
+                "b": pd.array([1, 2, 3, None], dtype="UInt8"),
+                "c": pd.array(["a", "b", "c", None], dtype="string"),
+                "d": pd.array([True, False, True, None], dtype="boolean"),
+            }
+        )
+        if engine == "fastparquet":
+            # Fastparquet doesn't support string columns yet
+            # Only int and boolean
+            result2 = result2.drop("c", axis=1)
+            expected = expected.drop("c", axis=1)
+        tm.assert_frame_equal(result2, expected)
+
 
 @pytest.mark.filterwarnings("ignore:CategoricalBlock is deprecated:DeprecationWarning")
 class TestParquetPyArrow(Base):
@@ -829,35 +869,6 @@ def test_additional_extension_types(self, pa):
         )
         check_round_trip(df, pa)
 
-    @td.skip_if_no("pyarrow")
-    def test_use_nullable_dtypes(self, pa):
-        import pyarrow.parquet as pq
-
-        table = pyarrow.table(
-            {
-                "a": pyarrow.array([1, 2, 3, None], "int64"),
-                "b": pyarrow.array([1, 2, 3, None], "uint8"),
-                "c": pyarrow.array(["a", "b", "c", None]),
-                "d": pyarrow.array([True, False, True, None]),
-            }
-        )
-        with tm.ensure_clean() as path:
-            # write manually with pyarrow to write integers
-            pq.write_table(table, path)
-            result1 = read_parquet(path)
-            result2 = read_parquet(path, use_nullable_dtypes=True)
-
-        assert result1["a"].dtype == np.dtype("float64")
-        expected = pd.DataFrame(
-            {
-                "a": pd.array([1, 2, 3, None], dtype="Int64"),
-                "b": pd.array([1, 2, 3, None], dtype="UInt8"),
-                "c": pd.array(["a", "b", "c", None], dtype="string"),
-                "d": pd.array([True, False, True, None], dtype="boolean"),
-            }
-        )
-        tm.assert_frame_equal(result2, expected)
-
     def test_timestamp_nanoseconds(self, pa):
         # with version 2.0, pyarrow defaults to writing the nanoseconds, so
         # this should work without error
@@ -928,7 +939,9 @@ def test_duplicate_columns(self, fp):
     def test_bool_with_none(self, fp):
         df = pd.DataFrame({"a": [True, None, False]})
         expected = pd.DataFrame({"a": [1.0, np.nan, 0.0]}, dtype="float16")
-        check_round_trip(df, fp, expected=expected)
+        # A bug in fastparquet 0.7.1 returns this column as float64 instead of
+        # float16, so the dtype is not checked
+        check_round_trip(df, fp, expected=expected, check_dtype=False)
 
     def test_unsupported(self, fp):
 
@@ -1049,9 +1062,14 @@ def test_timezone_aware_index(self, fp, timezone_aware_date_list):
         expected.index.name = "index"
         check_round_trip(df, fp, expected=expected)
 
-    def test_use_nullable_dtypes_not_supported(self, fp):
+    def test_use_nullable_dtypes_not_supported(self, monkeypatch, fp):
         df = pd.DataFrame({"a": [1, 2]})
 
+        # fastparquet 0.7.1 and above do support use_nullable_dtypes, so fake
+        # an older version to check that it still raises for versions below
+        import fastparquet as fp
+
+        monkeypatch.setattr(fp, "__version__", "0.4")
         with tm.ensure_clean() as path:
             df.to_parquet(path)
             with pytest.raises(ValueError, match="not supported for the fastparquet"):
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 25ec5e1904d18..3b40c9c300ace 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -64,7 +64,7 @@ xlrd
 xlsxwriter
 xlwt
 odfpy
-fastparquet>=0.3.2, <0.7.0
+fastparquet>=0.3.2
 pyarrow>=0.17.0
 python-snappy
 pyqt5>=5.9.2

From 16a2238c3bfd4c652059a6392fc71381251bf498 Mon Sep 17 00:00:00 2001
From: Thomas Li <47963215+lithomas1@users.noreply.github.com>
Date: Tue, 10 Aug 2021 13:03:56 -0700
Subject: [PATCH 2/4] Revert fastparquet nullable dtype support (#42954)

---
 doc/source/whatsnew/v1.3.2.rst  |  1 -
 pandas/io/parquet.py            | 27 ++++++++++-----------------
 pandas/tests/io/test_parquet.py | 11 ++++++-----
 3 files changed, 16 insertions(+), 23 deletions(-)

diff --git a/doc/source/whatsnew/v1.3.2.rst b/doc/source/whatsnew/v1.3.2.rst
index b0ad5cd506fce..ef8f8245c6640 100644
--- a/doc/source/whatsnew/v1.3.2.rst
+++ b/doc/source/whatsnew/v1.3.2.rst
@@ -47,7 +47,6 @@ Bug fixes
 
 Other
 ~~~~~
-- :meth:`pandas.read_parquet` now supports reading nullable dtypes with ``fastparquet`` versions above 0.7.1.
 -
 
 .. ---------------------------------------------------------------------------
diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py
index f0aeeb3e6c893..49384cfb2e554 100644
--- a/pandas/io/parquet.py
+++ b/pandas/io/parquet.py
@@ -309,20 +309,16 @@ def write(
     def read(
         self, path, columns=None, storage_options: StorageOptions = None, **kwargs
     ):
-        parquet_kwargs = {}
+        parquet_kwargs: dict[str, Any] = {}
         use_nullable_dtypes = kwargs.pop("use_nullable_dtypes", False)
-        # Technically works with 0.7.0, but was incorrect
-        # so lets just require 0.7.1
         if Version(self.api.__version__) >= Version("0.7.1"):
-            # Need to set even for use_nullable_dtypes = False,
-            # since our defaults differ
-            parquet_kwargs["pandas_nulls"] = use_nullable_dtypes
-        else:
-            if use_nullable_dtypes:
-                raise ValueError(
-                    "The 'use_nullable_dtypes' argument is not supported for the "
-                    "fastparquet engine for fastparquet versions less than 0.7.1"
-                )
+            # We are disabling nullable dtypes for fastparquet pending discussion
+            parquet_kwargs["pandas_nulls"] = False
+        if use_nullable_dtypes:
+            raise ValueError(
+                "The 'use_nullable_dtypes' argument is not supported for the "
+                "fastparquet engine"
+            )
         path = stringify_path(path)
         handles = None
         if is_fsspec_url(path):
@@ -478,7 +474,8 @@ def read_parquet(
 
     use_nullable_dtypes : bool, default False
         If True, use dtypes that use ``pd.NA`` as missing value indicator
-        for the resulting DataFrame.
+        for the resulting DataFrame (only applicable for the ``pyarrow``
+        engine).
         As new dtypes are added that support ``pd.NA`` in the future, the
         output with this option will change to use those dtypes.
         Note: this is an experimental option, and behaviour (e.g. additional
@@ -486,10 +483,6 @@ def read_parquet(
 
         .. versionadded:: 1.2.0
 
-        .. versionchanged:: 1.3.2
-            ``use_nullable_dtypes`` now works with the the ``fastparquet`` engine
-            if ``fastparquet`` is version 0.7.1 or higher.
-
     **kwargs
         Any additional kwargs are passed to the engine.
 
diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index b951e92c0fa9c..c0e4cde0f01f8 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -579,11 +579,9 @@ def test_use_nullable_dtypes(self, engine):
         import pyarrow.parquet as pq
 
         if engine == "fastparquet":
-            pytest.importorskip(
-                "fastparquet",
-                "0.7.1",
-                reason="fastparquet must be 0.7.1 or higher for nullable dtype support",
-            )
+            # We are manually disabling fastparquet's
+            # nullable dtype support pending discussion
+            pytest.skip("Fastparquet nullable dtype support is disabled")
 
         table = pyarrow.table(
             {
@@ -591,6 +589,8 @@ def test_use_nullable_dtypes(self, engine):
                 "b": pyarrow.array([1, 2, 3, None], "uint8"),
                 "c": pyarrow.array(["a", "b", "c", None]),
                 "d": pyarrow.array([True, False, True, None]),
+                # Test that nullable dtypes used even in absence of nulls
+                "e": pyarrow.array([1, 2, 3, 4], "int64"),
             }
         )
         with tm.ensure_clean() as path:
@@ -606,6 +606,7 @@ def test_use_nullable_dtypes(self, engine):
                 "b": pd.array([1, 2, 3, None], dtype="UInt8"),
                 "c": pd.array(["a", "b", "c", None], dtype="string"),
                 "d": pd.array([True, False, True, None], dtype="boolean"),
+                "e": pd.array([1, 2, 3, 4], dtype="Int64"),
             }
         )
         if engine == "fastparquet":

From 2dca892a7395d0362f5d1ab92f308c727c0a5d64 Mon Sep 17 00:00:00 2001
From: Thomas Li <thomasli1234567890@gmail.com>
Date: Wed, 11 Aug 2021 11:54:14 -0700
Subject: [PATCH 3/4] Revert doc changes

---
 doc/source/whatsnew/v1.3.2.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/source/whatsnew/v1.3.2.rst b/doc/source/whatsnew/v1.3.2.rst
index ef8f8245c6640..669e824fa3989 100644
--- a/doc/source/whatsnew/v1.3.2.rst
+++ b/doc/source/whatsnew/v1.3.2.rst
@@ -48,6 +48,7 @@ Bug fixes
 Other
 ~~~~~
 -
+-
 
 .. ---------------------------------------------------------------------------
 

From 1080e1187e0ce2e1c1817803a853a8b06eeb12e1 Mon Sep 17 00:00:00 2001
From: Thomas Li <47963215+lithomas1@users.noreply.github.com>
Date: Wed, 11 Aug 2021 16:22:54 -0700
Subject: [PATCH 4/4] upgrade numpy?

---
 ci/deps/actions-37-db.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/deps/actions-37-db.yaml b/ci/deps/actions-37-db.yaml
index a9e4113bf9d18..9d680cb8338fd 100644
--- a/ci/deps/actions-37-db.yaml
+++ b/ci/deps/actions-37-db.yaml
@@ -25,7 +25,7 @@ dependencies:
   - flask
   - nomkl
   - numexpr
-  - numpy=1.17.*
+  - numpy=1.18.*
   - odfpy
   - openpyxl
   - pandas-gbq