From 41a5048ed5b97f395abf302f43ef7d2e3ae0e8ad Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 2 Mar 2023 11:23:02 +0100 Subject: [PATCH 1/4] Revert "CLN: Remove read_orc dtype checking (#51604)" This reverts commit b69bd0755d654c0dba42042df7b2251918274327. --- pandas/io/orc.py | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/pandas/io/orc.py b/pandas/io/orc.py index 3999fc5840f02..41948e14cdd68 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -21,6 +21,13 @@ ) from pandas.compat._optional import import_optional_dependency +from pandas.core.dtypes.common import ( + is_categorical_dtype, + is_interval_dtype, + is_period_dtype, + is_unsigned_integer_dtype, +) + from pandas.core.arrays import ArrowExtensionArray from pandas.core.frame import DataFrame @@ -201,10 +208,22 @@ def to_orc( if engine_kwargs is None: engine_kwargs = {} + # If unsupported dtypes are found raise NotImplementedError + # In Pyarrow 9.0.0 this check will no longer be needed + for dtype in df.dtypes: + if ( + is_categorical_dtype(dtype) + or is_interval_dtype(dtype) + or is_period_dtype(dtype) + or is_unsigned_integer_dtype(dtype) + ): + raise NotImplementedError( + "The dtype of one or more columns is not supported yet." + ) + if engine != "pyarrow": raise ValueError("engine must be 'pyarrow'") engine = import_optional_dependency(engine, min_version="7.0.0") - pa = import_optional_dependency("pyarrow") orc = import_optional_dependency("pyarrow.orc") was_none = path is None @@ -219,7 +238,7 @@ def to_orc( handles.handle, **engine_kwargs, ) - except (TypeError, pa.ArrowNotImplementedError) as e: + except TypeError as e: raise NotImplementedError( "The dtype of one or more columns is not supported yet." ) from e From f101cde13721dcf08c693e22eba18079307de4cb Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 2 Mar 2023 15:13:46 +0100 Subject: [PATCH 2/4] Only check for 7.0 --- pandas/io/orc.py | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/pandas/io/orc.py b/pandas/io/orc.py index 41948e14cdd68..d10a345aab1d3 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -19,6 +19,7 @@ ReadBuffer, WriteBuffer, ) +from pandas.compat import pa_version_under8p0 from pandas.compat._optional import import_optional_dependency from pandas.core.dtypes.common import ( @@ -209,17 +210,18 @@ def to_orc( engine_kwargs = {} # If unsupported dtypes are found raise NotImplementedError - # In Pyarrow 9.0.0 this check will no longer be needed - for dtype in df.dtypes: - if ( - is_categorical_dtype(dtype) - or is_interval_dtype(dtype) - or is_period_dtype(dtype) - or is_unsigned_integer_dtype(dtype) - ): - raise NotImplementedError( - "The dtype of one or more columns is not supported yet." - ) + # In Pyarrow 8.0.0 this check will no longer be needed + if pa_version_under8p0: + for dtype in df.dtypes: + if ( + is_categorical_dtype(dtype) + or is_interval_dtype(dtype) + or is_period_dtype(dtype) + or is_unsigned_integer_dtype(dtype) + ): + raise NotImplementedError( + "The dtype of one or more columns is not supported yet." + ) if engine != "pyarrow": raise ValueError("engine must be 'pyarrow'") From 4166fcb8844517367ccd5f554a1a37b81f004137 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 2 Mar 2023 15:45:13 +0100 Subject: [PATCH 3/4] FIx regex --- pandas/tests/io/test_orc.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py index 9db19d4eb8448..0b4b5c807cbc0 100644 --- a/pandas/tests/io/test_orc.py +++ b/pandas/tests/io/test_orc.py @@ -301,7 +301,10 @@ def test_orc_roundtrip_bytesio(): def test_orc_writer_dtypes_not_supported(df_not_supported): # GH44554 # PyArrow gained ORC write support with the current argument order - msg = "The dtype of one or more columns is not supported yet." + msg = ( + "The dtype of one or more columns is not supported yet.|" + "Unknown or unsupported Arrow type" + ) with pytest.raises(NotImplementedError, match=msg): df_not_supported.to_orc() From 0862ce26fda32a4718f04d34c1fb7efe9cdf0e32 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 2 Mar 2023 23:01:08 +0100 Subject: [PATCH 4/4] Update --- pandas/io/orc.py | 3 ++- pandas/tests/io/test_orc.py | 5 +---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/pandas/io/orc.py b/pandas/io/orc.py index d10a345aab1d3..1b9be9adc1196 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -226,6 +226,7 @@ def to_orc( if engine != "pyarrow": raise ValueError("engine must be 'pyarrow'") engine = import_optional_dependency(engine, min_version="7.0.0") + pa = import_optional_dependency("pyarrow") orc = import_optional_dependency("pyarrow.orc") was_none = path is None @@ -240,7 +241,7 @@ def to_orc( handles.handle, **engine_kwargs, ) - except TypeError as e: + except (TypeError, pa.ArrowNotImplementedError) as e: raise NotImplementedError( "The dtype of one or more columns is not supported yet." ) from e diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py index 0b4b5c807cbc0..9db19d4eb8448 100644 --- a/pandas/tests/io/test_orc.py +++ b/pandas/tests/io/test_orc.py @@ -301,10 +301,7 @@ def test_orc_roundtrip_bytesio(): def test_orc_writer_dtypes_not_supported(df_not_supported): # GH44554 # PyArrow gained ORC write support with the current argument order - msg = ( - "The dtype of one or more columns is not supported yet.|" - "Unknown or unsupported Arrow type" - ) + msg = "The dtype of one or more columns is not supported yet." with pytest.raises(NotImplementedError, match=msg): df_not_supported.to_orc()