From 0963f7a5620ddb971675f208409fed0db7b36d46 Mon Sep 17 00:00:00 2001 From: anirudhsekar96 Date: Wed, 19 Jan 2022 15:55:13 -0800 Subject: [PATCH 1/4] closes #44914 by changing the path object to string if it is of io.BufferedWriter type. --- doc/source/whatsnew/v1.4.0.rst | 1 + pandas/io/parquet.py | 3 +++ 2 files changed, 4 insertions(+) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 1ca4e8cc97df0..3222f6292ac46 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -935,6 +935,7 @@ I/O - Bug in :func:`read_json` raising ``ValueError`` when attempting to parse json strings containing "://" (:issue:`36271`) - Bug in :func:`read_csv` when ``engine="c"`` and ``encoding_errors=None`` which caused a segfault (:issue:`45180`) - Bug in :func:`read_csv` an invalid value of ``usecols`` leading to an un-closed file handle (:issue:`45384`) +- Bug in :func:`read_parquet` when ``engine="pyarrow"`` which caused partial write to disk when column of unsupported datatype was passed (:issue:`44914`) Period ^^^^^^ diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 4880c7730ff07..2bdc514bb09b2 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -180,6 +180,9 @@ def write( mode="wb", is_dir=partition_cols is not None, ) + if isinstance(path_or_handle, WriteBuffer): + path_or_handle = path_or_handle.raw.name + try: if partition_cols is not None: # writes to multiple files under the given path From ee8cd90cfe9527f7714b4f114b7aefcdf7a4c2df Mon Sep 17 00:00:00 2001 From: anirudhsekar96 Date: Wed, 19 Jan 2022 16:30:36 -0800 Subject: [PATCH 2/4] #44914 Changes WriteBuffer class to io.BufferedWriter class for typecasting to string in path object for to_parquet --- pandas/io/parquet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 2bdc514bb09b2..c5bfbd2b6b35d 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -180,7 +180,7 
@@ def write( mode="wb", is_dir=partition_cols is not None, ) - if isinstance(path_or_handle, WriteBuffer): + if isinstance(path_or_handle, io.BufferedWriter): path_or_handle = path_or_handle.raw.name try: From 721585bfd983ec67cc482af8a78c1cf7fba7c034 Mon Sep 17 00:00:00 2001 From: anirudhsekar96 Date: Fri, 21 Jan 2022 09:19:35 -0800 Subject: [PATCH 3/4] Added tests for pyarrow unsupported columns --- pandas/tests/io/test_parquet.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index decdf02dd3072..8116e731628a3 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -728,6 +728,27 @@ def test_unsupported(self, pa): # older pyarrows raise ArrowInvalid self.check_external_error_on_write(df, pa, pyarrow.ArrowException) + def test_unsupported_float16(self, pa): + # #44847, #44914 + # Not able to write float 16 column using pyarrow. + data = np.arange(2, 10, dtype=np.float16) + df = pd.DataFrame(data=data, columns=["fp16"]) + self.check_external_error_on_write(df, pa, pyarrow.ArrowException) + + @pytest.mark.parametrize("path_type", [str, pathlib.Path]) + def test_unsupported_float16_cleanup(self, pa, path_type): + # #44847, #44914 + # Not able to write float 16 column using pyarrow. 
+ # Tests cleanup by pyarrow in case of an error + data = np.arange(2, 10, dtype=np.float16) + df = pd.DataFrame(data=data, columns=["fp16"]) + + with tm.ensure_clean() as path_str: + path = path_type(path_str) + with tm.external_error_raised(pyarrow.ArrowException): + df.to_parquet(path=path, engine=pa) + assert not os.path.isfile(path) + def test_categorical(self, pa): # supported in >= 0.7.0 From 2327294f848e61e50d609728deb698abd0e7c137 Mon Sep 17 00:00:00 2001 From: anirudhsekar96 Date: Fri, 21 Jan 2022 16:56:08 -0800 Subject: [PATCH 4/4] Moved change log to v1.5.0 --- doc/source/whatsnew/v1.4.0.rst | 1 - doc/source/whatsnew/v1.5.0.rst | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index c77cff38f46d9..4e5369072e116 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -967,7 +967,6 @@ I/O - Bug in :func:`read_json` raising ``ValueError`` when attempting to parse json strings containing "://" (:issue:`36271`) - Bug in :func:`read_csv` when ``engine="c"`` and ``encoding_errors=None`` which caused a segfault (:issue:`45180`) - Bug in :func:`read_csv` an invalid value of ``usecols`` leading to an unclosed file handle (:issue:`45384`) -- Bug in :func:`read_parquet` when ``engine="pyarrow"`` which caused partial write to disk when column of unsupported datatype was passed (:issue:`44914`) Period ^^^^^^ diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 1ae76984484af..8abd5a6e59995 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -227,6 +227,7 @@ MultiIndex I/O ^^^ - Bug in :meth:`DataFrame.to_stata` where no error is raised if the :class:`DataFrame` contains ``-np.inf`` (:issue:`45350`) +- Bug in :meth:`DataFrame.to_parquet` when ``engine="pyarrow"`` which caused partial write to disk when column of unsupported datatype was passed (:issue:`44914`) - Period