diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 916bcf3db9a4a..7136bda2ea865 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -754,6 +754,7 @@ I/O - Bug in :func:`read_csv` when passing simultaneously a parser in ``date_parser`` and ``parse_dates=False``, the parsing was still called (:issue:`44366`) - Bug in :func:`read_csv` silently ignoring errors when failling to create a memory-mapped file (:issue:`44766`) - Bug in :func:`read_csv` when passing a ``tempfile.SpooledTemporaryFile`` opened in binary mode (:issue:`44748`) +- Bug in :meth:`DataFrame.to_parquet` where ``engine="pyarrow"`` could result in partial write when column dtype is ``float16`` (:issue:`44846`) - Period diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 56131d000b176..fcb6a77eac02e 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -167,6 +167,13 @@ def write( ): self.validate_dataframe(df) + fp16_columns = df.select_dtypes(include="float16").columns + if fp16_columns.size > 0: + raise ValueError( + f"Columns [{','.join(map(str, fp16_columns))}] are of dtype float16. " + + "PyArrow does not support saving float16 dtype columns." + ) + from_pandas_kwargs: dict[str, Any] = {"schema": kwargs.pop("schema", None)} if index is not None: from_pandas_kwargs["preserve_index"] = index diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 0bd291cea894e..1ed5d48711495 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -715,6 +715,18 @@ def test_duplicate_columns(self, pa): df = pd.DataFrame(np.arange(12).reshape(4, 3), columns=list("aaa")).copy() self.check_error_on_write(df, pa, ValueError, "Duplicate column names found") + def test_unsupported_float16(self, pa): + # #44847 + # Not able to write float 16 column using pyarrow.
+ data = np.arange(2, 10, dtype=np.float16) + df = pd.DataFrame(data=data, columns=["fp16"]) + fp16_columns = df.select_dtypes(include="float16").columns + msg = ( + f"Columns \\[{','.join(map(str, fp16_columns))}\\] are of dtype float16. " + + "PyArrow does not support saving float16 dtype columns." + ) + self.check_error_on_write(df, pa, ValueError, msg) + def test_unsupported(self, pa): # timedelta df = pd.DataFrame({"a": pd.timedelta_range("1 day", periods=3)})