From 9a7b29ae7a2945ee07fcd956f6a54bd1089de6a7 Mon Sep 17 00:00:00 2001 From: NickFillot <40593450+NickFillot@users.noreply.github.com> Date: Sun, 3 Oct 2021 16:23:02 +0200 Subject: [PATCH 01/49] [ENH] to_orc pandas.io.orc.to_orc method definition --- pandas/io/orc.py | 83 ++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 81 insertions(+), 2 deletions(-) diff --git a/pandas/io/orc.py b/pandas/io/orc.py index b02660c089382..06d9563aa080f 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -1,7 +1,9 @@ """ orc compat """ from __future__ import annotations +import os from typing import TYPE_CHECKING +from tempfile import gettempdir from pandas._typing import ( FilePath, @@ -11,8 +13,10 @@ from pandas.io.common import get_handle -if TYPE_CHECKING: - from pandas import DataFrame +from pandas.core import generic +from pandas.util._decorators import doc + +from pandas import DataFrame def read_orc( @@ -52,3 +56,78 @@ def read_orc( with get_handle(path, "rb", is_text=False) as handles: orc_file = orc.ORCFile(handles.handle) return orc_file.read(columns=columns, **kwargs).to_pandas() + + +def to_orc( + df: DataFrame, + path: FilePathOrBuffer = None, + engine: str = 'pyarrow', + index: bool = None, + **kwargs +) -> bytes: + """ + Write a DataFrame to the orc/arrow format. + Parameters + ---------- + df : DataFrame + path : str or file-like object, default None + If a string, it will be used as Root Directory path + when writing a partitioned dataset. By file-like object, + we refer to objects with a write() method, such as a file handle + (e.g. via builtin open function) or io.BytesIO. The engine + fastparquet does not accept file-like objects. If path is None, + a bytes object is returned. + engine : {{'pyarrow'}}, default 'pyarrow' + Parquet library to use, or library it self, checked with 'pyarrow' name + and version > 4.0.0 + index : bool, default None + If ``True``, include the dataframe's index(es) in the file output. If + ``False``, they will not be written to the file. + If ``None``, similar to ``infer`` the dataframe's index(es) + will be saved. However, instead of being saved as values, + the RangeIndex will be stored as a range in the metadata so it + doesn't require much space and is faster. Other indexes will + be included as columns in the file output. 
+ kwargs + Additional keyword arguments passed to the engine + Returns + ------- + bytes if no path argument is provided else None + """ + if index is None: + index = df.index.names[0] is not None + + if isinstance(engine, str): + engine = import_optional_dependency(engine, min_version='4.0.0') + else: + try: + assert engine.__name__ == 'pyarrow', "engine must be 'pyarrow' module" + assert hasattr(engine, 'orc'), "'pyarrow' module must have version > 4.0.0 with orc module" + except Exception as e: + raise ValueError("Wrong engine passed, %s" % ( + e, + )) + + if path is None: + # to bytes: tmp path, pyarrow auto closes buffers + path = os.path.join(gettempdir(), os.urandom(12).hex()) + try: + engine.orc.write_table( + engine.Table.from_pandas(df, preserve_index=index), + path, **kwargs + ) + with open(path, 'rb') as path: + return path.read() + except BaseException as e: + raise e + finally: + try: + os.remove(path) + except Exception as e: + pass + else: + engine.orc.write_table( + engine.Table.from_pandas(df, preserve_index=index), + path, **kwargs + ) + return From d11026f0a310a24a09f0357407835a03ccd2a7bf Mon Sep 17 00:00:00 2001 From: NickFillot <40593450+NickFillot@users.noreply.github.com> Date: Sun, 3 Oct 2021 16:34:37 +0200 Subject: [PATCH 02/49] pandas.DataFrame.to_orc set to_orc to pandas.DataFrame --- pandas/core/frame.py | 74 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 54ee5ed2f35d1..694cfdf9f8e82 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2900,7 +2900,81 @@ def to_parquet( storage_options=storage_options, **kwargs, ) + + def to_orc( + self, + path: FilePathOrBuffer = None, + engine: str = 'pyarrow', + index: bool = None, + **kwargs + ) -> bytes: + """ + Write a DataFrame to the orc/arrow format. + Parameters + ---------- + df : DataFrame + path : str or file-like object, default None + If a string, it will be used as Root Directory path + when writing a partitioned dataset. By file-like object, + we refer to objects with a write() method, such as a file handle + (e.g. via builtin open function) or io.BytesIO. The engine + fastparquet does not accept file-like objects. If path is None, + a bytes object is returned. + engine : {{'pyarrow'}}, default 'pyarrow' + Parquet library to use, or library it self, checked with 'pyarrow' name + and version > 4.0.0 + index : bool, default None + If ``True``, include the dataframe's index(es) in the file output. If + ``False``, they will not be written to the file. + If ``None``, similar to ``infer`` the dataframe's index(es) + will be saved. However, instead of being saved as values, + the RangeIndex will be stored as a range in the metadata so it + doesn't require much space and is faster. Other indexes will + be included as columns in the file output. + kwargs + Additional keyword arguments passed to the engine + Returns + ------- + bytes if no path argument is provided else None + + See Also + -------- + read_orc : Read a ORC file. + DataFrame.to_parquet : Write a parquet file. + DataFrame.to_csv : Write a csv file. + DataFrame.to_sql : Write to a sql table. + DataFrame.to_hdf : Write to hdf. + Notes + ----- + This function requires `pyarrow `_ library. 
+ + Examples + -------- + >>> df = pd.DataFrame(data={{'col1': [1, 2], 'col2': [3, 4]}}) + >>> df.to_orc('df.orc', compression='gzip') # doctest: +SKIP + >>> pd.read_orc('df.orc') # doctest: +SKIP + col1 col2 + 0 1 3 + 1 2 4 + + If you want to get a buffer to the orc content you can write it to io.BytesIO + >>> import io + >>> b = io.BytesIO(df.to_orc()) + >>> b.seek(0) + 0 + >>> content = b.read() + """ + from pandas.io.orc import to_orc + + return to_orc( + self, + path, + engine, + index=index, + **kwargs + ) + @Substitution( header_type="bool", header="Whether to print column labels, default True", From 0146ac3aea9f87cb0e053af0784a17efd8230e3f Mon Sep 17 00:00:00 2001 From: NickFillot <40593450+NickFillot@users.noreply.github.com> Date: Sun, 3 Oct 2021 16:47:11 +0200 Subject: [PATCH 03/49] Cleaning --- pandas/io/orc.py | 24 +++++++----------------- 1 file changed, 7 insertions(+), 17 deletions(-) diff --git a/pandas/io/orc.py b/pandas/io/orc.py index 06d9563aa080f..15161ae202ad3 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -2,6 +2,8 @@ from __future__ import annotations import os +import pandas._testing as tm + from typing import TYPE_CHECKING from tempfile import gettempdir @@ -13,10 +15,8 @@ from pandas.io.common import get_handle -from pandas.core import generic -from pandas.util._decorators import doc - -from pandas import DataFrame +if TYPE_CHECKING: + from pandas import DataFrame def read_orc( @@ -102,29 +102,19 @@ def to_orc( else: try: assert engine.__name__ == 'pyarrow', "engine must be 'pyarrow' module" - assert hasattr(engine, 'orc'), "'pyarrow' module must have version > 4.0.0 with orc module" + assert hasattr(engine, 'orc'), "'pyarrow' module must have orc module" except Exception as e: - raise ValueError("Wrong engine passed, %s" % ( - e, - )) + raise ValueError("Wrong engine passed, %s" % e) if path is None: # to bytes: tmp path, pyarrow auto closes buffers - path = os.path.join(gettempdir(), os.urandom(12).hex()) - try: + with tm.ensure_clean(os.path.join(gettempdir(), os.urandom(12).hex())) as path: engine.orc.write_table( engine.Table.from_pandas(df, preserve_index=index), path, **kwargs ) with open(path, 'rb') as path: return path.read() - except BaseException as e: - raise e - finally: - try: - os.remove(path) - except Exception as e: - pass else: engine.orc.write_table( engine.Table.from_pandas(df, preserve_index=index), From 057160250d4038e3cd35f883e7505c57b4c28fc5 Mon Sep 17 00:00:00 2001 From: Ian Alexander Joiner Date: Sun, 21 Nov 2021 04:29:12 -0500 Subject: [PATCH 04/49] Fix style & edit comments & change min dependency version to 5.0.0 --- pandas/core/frame.py | 16 ++++++++-------- pandas/io/orc.py | 45 +++++++++++++++++++++++--------------------- 2 files changed, 32 insertions(+), 29 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 694cfdf9f8e82..24991bd09e118 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2900,7 +2900,7 @@ def to_parquet( storage_options=storage_options, **kwargs, ) - + def to_orc( self, path: FilePathOrBuffer = None, @@ -2909,7 +2909,7 @@ def to_orc( **kwargs ) -> bytes: """ - Write a DataFrame to the orc/arrow format. + Write a DataFrame to the ORC format. Parameters ---------- df : DataFrame @@ -2917,12 +2917,12 @@ def to_orc( If a string, it will be used as Root Directory path when writing a partitioned dataset. By file-like object, we refer to objects with a write() method, such as a file handle - (e.g. via builtin open function) or io.BytesIO. 
The engine - fastparquet does not accept file-like objects. If path is None, - a bytes object is returned. + (e.g. via builtin open function). If path is None, + a bytes object is returned. Note that currently the pyarrow + engine doesn't work with io.BytesIO. engine : {{'pyarrow'}}, default 'pyarrow' Parquet library to use, or library it self, checked with 'pyarrow' name - and version > 4.0.0 + and version >= 5.0.0 index : bool, default None If ``True``, include the dataframe's index(es) in the file output. If ``False``, they will not be written to the file. @@ -2952,7 +2952,7 @@ def to_orc( Examples -------- >>> df = pd.DataFrame(data={{'col1': [1, 2], 'col2': [3, 4]}}) - >>> df.to_orc('df.orc', compression='gzip') # doctest: +SKIP + >>> df.to_orc('df.orc') # doctest: +SKIP >>> pd.read_orc('df.orc') # doctest: +SKIP col1 col2 0 1 3 @@ -2974,7 +2974,7 @@ def to_orc( index=index, **kwargs ) - + @Substitution( header_type="bool", header="Whether to print column labels, default True", diff --git a/pandas/io/orc.py b/pandas/io/orc.py index 15161ae202ad3..6664348656c84 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -1,9 +1,6 @@ """ orc compat """ from __future__ import annotations -import os -import pandas._testing as tm - from typing import TYPE_CHECKING from tempfile import gettempdir @@ -66,7 +63,7 @@ def to_orc( **kwargs ) -> bytes: """ - Write a DataFrame to the orc/arrow format. + Write a DataFrame to the ORC format. Parameters ---------- df : DataFrame @@ -74,12 +71,12 @@ def to_orc( If a string, it will be used as Root Directory path when writing a partitioned dataset. By file-like object, we refer to objects with a write() method, such as a file handle - (e.g. via builtin open function) or io.BytesIO. The engine - fastparquet does not accept file-like objects. If path is None, - a bytes object is returned. + (e.g. via builtin open function). If path is None, + a bytes object is returned. Note that currently the pyarrow + engine doesn't work with io.BytesIO. engine : {{'pyarrow'}}, default 'pyarrow' Parquet library to use, or library it self, checked with 'pyarrow' name - and version > 4.0.0 + and version >= 5.0.0 index : bool, default None If ``True``, include the dataframe's index(es) in the file output. If ``False``, they will not be written to the file. 
@@ -96,25 +93,31 @@ def to_orc( """ if index is None: index = df.index.names[0] is not None - + if isinstance(engine, str): - engine = import_optional_dependency(engine, min_version='4.0.0') + engine = import_optional_dependency(engine, min_version='5.0.0') else: try: assert engine.__name__ == 'pyarrow', "engine must be 'pyarrow' module" assert hasattr(engine, 'orc'), "'pyarrow' module must have orc module" - except Exception as e: - raise ValueError("Wrong engine passed, %s" % e) - + except ImportError as e: + raise ValueError ( + "Unable to find a usable engine; " + "tried using: 'pyarrow'.\n" + "A suitable version of " + "pyarrow is required for ORC support.\n" + "Trying to import the above resulted in these errors:" + f"\n - {e}" + ) + if path is None: - # to bytes: tmp path, pyarrow auto closes buffers - with tm.ensure_clean(os.path.join(gettempdir(), os.urandom(12).hex())) as path: - engine.orc.write_table( - engine.Table.from_pandas(df, preserve_index=index), - path, **kwargs - ) - with open(path, 'rb') as path: - return path.read() + # to bytes: pyarrow auto closes buffers hence we read a pyarrow buffer + stream = engine.BufferOutputStream() + engine.orc.write_table( + engine.Table.from_pandas(df, preserve_index=index), + stream, **kwargs + ) + return stream.getvalue().to_pybytes() else: engine.orc.write_table( engine.Table.from_pandas(df, preserve_index=index), From d970b5832d73f682dcddc63646cf55669d4d2a0e Mon Sep 17 00:00:00 2001 From: Ian Alexander Joiner Date: Sun, 21 Nov 2021 04:32:15 -0500 Subject: [PATCH 05/49] Fix style & add to see also --- pandas/core/frame.py | 4 +++- pandas/io/orc.py | 16 ++++++++-------- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 24991bd09e118..255cd2388dc1b 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2858,6 +2858,7 @@ def to_parquet( See Also -------- read_parquet : Read a parquet file. + DataFrame.to_orc : Write an orc file. DataFrame.to_csv : Write a csv file. DataFrame.to_sql : Write to a sql table. DataFrame.to_hdf : Write to hdf. @@ -2947,7 +2948,8 @@ def to_orc( Notes ----- - This function requires `pyarrow `_ library. + This function requires `pyarrow ` + _ library. 
Examples -------- diff --git a/pandas/io/orc.py b/pandas/io/orc.py index 6664348656c84..06a41912a73fa 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -101,14 +101,14 @@ def to_orc( assert engine.__name__ == 'pyarrow', "engine must be 'pyarrow' module" assert hasattr(engine, 'orc'), "'pyarrow' module must have orc module" except ImportError as e: - raise ValueError ( - "Unable to find a usable engine; " - "tried using: 'pyarrow'.\n" - "A suitable version of " - "pyarrow is required for ORC support.\n" - "Trying to import the above resulted in these errors:" - f"\n - {e}" - ) + raise ValueError( + "Unable to find a usable engine; " + "tried using: 'pyarrow'.\n" + "A suitable version of " + "pyarrow is required for ORC support.\n" + "Trying to import the above resulted in these errors:" + f"\n - {e}" + ) if path is None: # to bytes: pyarrow auto closes buffers hence we read a pyarrow buffer From 8b12e9f82e70e805881c9e39bccfba06370982a7 Mon Sep 17 00:00:00 2001 From: Ian Alexander Joiner Date: Sun, 21 Nov 2021 04:53:23 -0500 Subject: [PATCH 06/49] Add ORC to documentation --- doc/source/reference/frame.rst | 1 + doc/source/reference/io.rst | 1 + doc/source/user_guide/io.rst | 59 +++++++++++++++++++++++++++++++-- doc/source/user_guide/scale.rst | 17 ++++++++++ pandas/core/generic.py | 1 + 5 files changed, 76 insertions(+), 3 deletions(-) diff --git a/doc/source/reference/frame.rst b/doc/source/reference/frame.rst index ea27d1efbb235..e71ee80767d29 100644 --- a/doc/source/reference/frame.rst +++ b/doc/source/reference/frame.rst @@ -373,6 +373,7 @@ Serialization / IO / conversion DataFrame.from_dict DataFrame.from_records + DataFrame.to_orc DataFrame.to_parquet DataFrame.to_pickle DataFrame.to_csv diff --git a/doc/source/reference/io.rst b/doc/source/reference/io.rst index 70fd381bffd2c..425b5f81be966 100644 --- a/doc/source/reference/io.rst +++ b/doc/source/reference/io.rst @@ -159,6 +159,7 @@ ORC :toctree: api/ read_orc + DataFrame.to_orc SAS ~~~ diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 4e19deb84487f..f3e712197f9c5 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -30,7 +30,7 @@ The pandas I/O API is a set of top level ``reader`` functions accessed like binary;`HDF5 Format `__;:ref:`read_hdf`;:ref:`to_hdf` binary;`Feather Format `__;:ref:`read_feather`;:ref:`to_feather` binary;`Parquet Format `__;:ref:`read_parquet`;:ref:`to_parquet` - binary;`ORC Format `__;:ref:`read_orc`; + binary;`ORC Format `__;:ref:`read_orc`;:ref:`to_orc` binary;`Stata `__;:ref:`read_stata`;:ref:`to_stata` binary;`SAS `__;:ref:`read_sas`; binary;`SPSS `__;:ref:`read_spss`; @@ -5562,14 +5562,67 @@ ORC .. versionadded:: 1.0.0 Similar to the :ref:`parquet ` format, the `ORC Format `__ is a binary columnar serialization -for data frames. It is designed to make reading data frames efficient. pandas provides *only* a reader for the -ORC format, :func:`~pandas.read_orc`. This requires the `pyarrow `__ library. +for data frames. It is designed to make reading data frames efficient. pandas provides both the reader and the writer for the +ORC format, :func:`~pandas.read_orc` and :func:`~pandas.DataFrame.to_orc`. This requires the `pyarrow `__ library. .. warning:: * It is *highly recommended* to install pyarrow using conda due to some issues occurred by pyarrow. * :func:`~pandas.read_orc` is not supported on Windows yet, you can find valid environments on :ref:`install optional dependencies `. +.. 
ipython:: python + + df = pd.DataFrame( + { + "a": list("abc"), + "b": list(range(1, 4)), + "c": np.arange(3, 6).astype("u1"), + "d": np.arange(4.0, 7.0, dtype="float64"), + "e": [True, False, True], + "f": pd.date_range("20130101", periods=3), + "g": pd.date_range("20130101", periods=3, tz="US/Eastern"), + "h": pd.Categorical(list("abc")), + "i": pd.Categorical(list("abc"), ordered=True), + } + ) + + df + df.dtypes + +Write to an orc file. + +.. ipython:: python + :okwarning: + + df.to_orc("example_pa.orc", engine="pyarrow") + +Read from an orc file. + +.. ipython:: python + :okwarning: + + result = pd.read_orc("example_pa.orc", engine="pyarrow") + + result.dtypes + +Read only certain columns of an orc file. + +.. ipython:: python + + result = pd.read_orc( + "example_pa.orc", + engine="pyarrow", + columns=["a", "b"], + ) + result.dtypes + + +.. ipython:: python + :suppress: + + os.remove("example_pa.orc") + + .. _io.sql: SQL queries diff --git a/doc/source/user_guide/scale.rst b/doc/source/user_guide/scale.rst index 129f43dd36930..cf8a0c9845e62 100644 --- a/doc/source/user_guide/scale.rst +++ b/doc/source/user_guide/scale.rst @@ -18,6 +18,23 @@ tool for all situations. If you're working with very large datasets and a tool like PostgreSQL fits your needs, then you should probably be using that. Assuming you want or need the expressiveness and power of pandas, let's carry on. +.. ipython:: python + + import pandas as pd + import numpy as np + +.. ipython:: python + :suppress: + + from pandas._testing import _make_timeseries + + # Make a random in-memory dataset + ts = _make_timeseries(freq="30S", seed=0) + ts.to_csv("timeseries.csv") + ts.to_orc("timeseries.orc") + ts.to_parquet("timeseries.parquet") + + Load less data -------------- diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 89a590f291356..78edaf15fe7ce 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2629,6 +2629,7 @@ def to_hdf( See Also -------- read_hdf : Read from HDF file. + DataFrame.to_orc : Write a DataFrame to the binary orc format. DataFrame.to_parquet : Write a DataFrame to the binary parquet format. DataFrame.to_sql : Write to a SQL table. DataFrame.to_feather : Write out feather-format for DataFrames. 
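The documentation patch above wires :func:`~pandas.DataFrame.to_orc` into the I/O guides next to the existing :func:`~pandas.read_orc`. A minimal, self-contained sketch of that documented round trip, assuming pandas >= 1.5.0 with pyarrow >= 7.0.0 installed; the file name is illustrative only.

import os

import numpy as np
import pandas as pd

df = pd.DataFrame(
    {
        "a": list("abc"),
        "b": list(range(1, 4)),
        "c": np.arange(4.0, 7.0, dtype="float64"),
        "d": [True, False, True],
        "e": pd.date_range("20130101", periods=3),
    }
)

# Write with the pyarrow engine, then read the file back in full
# and with a column projection.
df.to_orc("example_pa.orc")
full = pd.read_orc("example_pa.orc")
subset = pd.read_orc("example_pa.orc", columns=["a", "b"])
print(full.dtypes)
print(subset.dtypes)

os.remove("example_pa.orc")
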
From 65e6b7a0d1ff00ffe7dd9cdac3420f874eacea82 Mon Sep 17 00:00:00 2001 From: Ian Alexander Joiner Date: Sun, 21 Nov 2021 21:28:35 -0500 Subject: [PATCH 07/49] Changes according to review --- pandas/core/frame.py | 4 ++-- pandas/io/orc.py | 29 +++++++++++++++++++---------- 2 files changed, 21 insertions(+), 12 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 255cd2388dc1b..fc078cd29cf9d 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2904,8 +2904,8 @@ def to_parquet( def to_orc( self, - path: FilePathOrBuffer = None, - engine: str = 'pyarrow', + path: FilePath | WriteBuffer[bytes] | None = None, + engine: Literal['pyarrow'] = 'pyarrow', index: bool = None, **kwargs ) -> bytes: diff --git a/pandas/io/orc.py b/pandas/io/orc.py index 06a41912a73fa..f352a54b1fc2a 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -1,12 +1,17 @@ """ orc compat """ from __future__ import annotations -from typing import TYPE_CHECKING +from types import ModuleType +from typing import ( + TYPE_CHECKING, + Literal, +) from tempfile import gettempdir from pandas._typing import ( FilePath, ReadBuffer, + WriteBuffer, ) from pandas.compat._optional import import_optional_dependency @@ -57,8 +62,8 @@ def read_orc( def to_orc( df: DataFrame, - path: FilePathOrBuffer = None, - engine: str = 'pyarrow', + path: FilePath | WriteBuffer[bytes] | None = None, + engine: Literal['pyarrow'] = 'pyarrow', # type: ignore[arg-type] index: bool = None, **kwargs ) -> bytes: @@ -96,7 +101,7 @@ def to_orc( if isinstance(engine, str): engine = import_optional_dependency(engine, min_version='5.0.0') - else: + elif isinstance(engine, ModuleType): try: assert engine.__name__ == 'pyarrow', "engine must be 'pyarrow' module" assert hasattr(engine, 'orc'), "'pyarrow' module must have orc module" @@ -109,18 +114,22 @@ def to_orc( "Trying to import the above resulted in these errors:" f"\n - {e}" ) + else: + raise TypeError( + f"unsuported type for engine: {type(engine)}" + ) - if path is None: - # to bytes: pyarrow auto closes buffers hence we read a pyarrow buffer - stream = engine.BufferOutputStream() + if hasattr(path, "write"): engine.orc.write_table( engine.Table.from_pandas(df, preserve_index=index), - stream, **kwargs + path, **kwargs ) - return stream.getvalue().to_pybytes() else: + # to bytes: pyarrow auto closes buffers hence we read a pyarrow buffer + stream = engine.BufferOutputStream() engine.orc.write_table( engine.Table.from_pandas(df, preserve_index=index), - path, **kwargs + stream, **kwargs ) + return stream.getvalue().to_pybytes() return From 2114616e4313a86c43761500253d4171d9282a64 Mon Sep 17 00:00:00 2001 From: Ian Alexander Joiner Date: Tue, 23 Nov 2021 21:48:33 -0500 Subject: [PATCH 08/49] Fix problems mentioned in comment --- pandas/core/frame.py | 2 +- pandas/io/orc.py | 44 +++++++++++++++++--------------------------- 2 files changed, 18 insertions(+), 28 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index fc078cd29cf9d..49ba0f4cbba5f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2922,7 +2922,7 @@ def to_orc( a bytes object is returned. Note that currently the pyarrow engine doesn't work with io.BytesIO. engine : {{'pyarrow'}}, default 'pyarrow' - Parquet library to use, or library it self, checked with 'pyarrow' name + ORC library to use, or library itself, checked with 'pyarrow' name and version >= 5.0.0 index : bool, default None If ``True``, include the dataframe's index(es) in the file output. 
If diff --git a/pandas/io/orc.py b/pandas/io/orc.py index f352a54b1fc2a..c919867811752 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -1,7 +1,6 @@ """ orc compat """ from __future__ import annotations -from types import ModuleType from typing import ( TYPE_CHECKING, Literal, @@ -63,7 +62,7 @@ def read_orc( def to_orc( df: DataFrame, path: FilePath | WriteBuffer[bytes] | None = None, - engine: Literal['pyarrow'] = 'pyarrow', # type: ignore[arg-type] + engine: Literal['pyarrow'] = 'pyarrow', index: bool = None, **kwargs ) -> bytes: @@ -99,37 +98,28 @@ def to_orc( if index is None: index = df.index.names[0] is not None - if isinstance(engine, str): - engine = import_optional_dependency(engine, min_version='5.0.0') - elif isinstance(engine, ModuleType): - try: - assert engine.__name__ == 'pyarrow', "engine must be 'pyarrow' module" - assert hasattr(engine, 'orc'), "'pyarrow' module must have orc module" - except ImportError as e: - raise ValueError( - "Unable to find a usable engine; " - "tried using: 'pyarrow'.\n" - "A suitable version of " - "pyarrow is required for ORC support.\n" - "Trying to import the above resulted in these errors:" - f"\n - {e}" - ) + if engine == "pyarrow": + engine = import_optional_dependency(engine, min_version='5.0.0') ) else: - raise TypeError( - f"unsuported type for engine: {type(engine)}" + raise ValueError( + f"engine must be 'pyarrow'" ) - if hasattr(path, "write"): + if not hasattr(path, "write"): engine.orc.write_table( engine.Table.from_pandas(df, preserve_index=index), path, **kwargs ) else: # to bytes: pyarrow auto closes buffers hence we read a pyarrow buffer - stream = engine.BufferOutputStream() - engine.orc.write_table( - engine.Table.from_pandas(df, preserve_index=index), - stream, **kwargs - ) - return stream.getvalue().to_pybytes() - return + with engine.BufferOutputStream() as stream: # if that is possible + engine.orc.write_table( + engine.Table.from_pandas(df, preserve_index=index), + stream, **kwargs + ) + # allows writing to any (fsspec) URL + with get_handle(path, "wb", is_text=False) as handles: + orc_bytes = stream.getvalue().to_pybytes() + handles.handle.write(orc_bytes) + if path is None: + return orc_bytes From e4b40ef861dbccbfa31eb2c9ba277f766b764ad9 Mon Sep 17 00:00:00 2001 From: Ian Alexander Joiner Date: Tue, 23 Nov 2021 22:11:03 -0500 Subject: [PATCH 09/49] Linter compliance --- pandas/io/orc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/orc.py b/pandas/io/orc.py index c919867811752..81721c8b02c80 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -99,7 +99,7 @@ def to_orc( index = df.index.names[0] is not None if engine == "pyarrow": - engine = import_optional_dependency(engine, min_version='5.0.0') ) + engine = import_optional_dependency(engine, min_version='5.0.0') else: raise ValueError( f"engine must be 'pyarrow'" From a7aa3e0d409cadce7f3c1f325e142ddc57e03e68 Mon Sep 17 00:00:00 2001 From: Ian Alexander Joiner Date: Wed, 24 Nov 2021 05:54:12 -0500 Subject: [PATCH 10/49] Address comments --- pandas/io/orc.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/io/orc.py b/pandas/io/orc.py index 81721c8b02c80..bedc7580d698a 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -105,7 +105,7 @@ def to_orc( f"engine must be 'pyarrow'" ) - if not hasattr(path, "write"): + if hasattr(path, "write"): engine.orc.write_table( engine.Table.from_pandas(df, preserve_index=index), path, **kwargs @@ -117,9 +117,9 @@ def to_orc( engine.Table.from_pandas(df, 
preserve_index=index), stream, **kwargs ) + orc_bytes = stream.getvalue().to_pybytes() + if path is None: + return orc_bytes # allows writing to any (fsspec) URL with get_handle(path, "wb", is_text=False) as handles: - orc_bytes = stream.getvalue().to_pybytes() handles.handle.write(orc_bytes) - if path is None: - return orc_bytes From 1ab9b6c836a44e73c3ccf348d93f1a99652b134b Mon Sep 17 00:00:00 2001 From: Ian Alexander Joiner Date: Thu, 2 Dec 2021 06:36:29 -0500 Subject: [PATCH 11/49] Add orc test --- pandas/io/orc.py | 6 +++--- pandas/tests/io/test_orc.py | 21 +++++++++++++++++++++ 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/pandas/io/orc.py b/pandas/io/orc.py index bedc7580d698a..02bf9f70406dc 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -98,12 +98,12 @@ def to_orc( if index is None: index = df.index.names[0] is not None - if engine == "pyarrow": - engine = import_optional_dependency(engine, min_version='5.0.0') - else: + if engine != "pyarrow": raise ValueError( f"engine must be 'pyarrow'" ) + engine = import_optional_dependency(engine, min_version='5.0.0') + if hasattr(path, "write"): engine.orc.write_table( diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py index f34e9b940317d..211352cebcb73 100644 --- a/pandas/tests/io/test_orc.py +++ b/pandas/tests/io/test_orc.py @@ -224,3 +224,24 @@ def test_orc_reader_snappy_compressed(dirpath): got = read_orc(inputfile).iloc[:10] tm.assert_equal(expected, got) + + +def test_orc_roundtrip(dirpath): + data = { + "boolean1": np.array([False, True], dtype="bool"), + "byte1": np.array([1, 100], dtype="int8"), + "short1": np.array([1024, 2048], dtype="int16"), + "int1": np.array([65536, 65536], dtype="int32"), + "long1": np.array([9223372036854775807, 9223372036854775807], dtype="int64"), + "float1": np.array([1.0, 2.0], dtype="float32"), + "double1": np.array([-15.0, -5.0], dtype="float64"), + "bytes1": np.array([b"\x00\x01\x02\x03\x04", b""], dtype="object"), + "string1": np.array(["hi", "bye"], dtype="object"), + } + expected = pd.DataFrame.from_dict(data) + + outputfile = os.path.join(dirpath, "TestOrcFile.testReadWrite.orc") + expected.to_orc(outputfile) + got = read_orc(outputfile) + + tm.assert_equal(expected, got) From 96969d50bf12f35f368065062e5719c88e05568a Mon Sep 17 00:00:00 2001 From: Ian Alexander Joiner Date: Fri, 3 Dec 2021 07:58:47 +0000 Subject: [PATCH 12/49] Fixes from pre-commit [automated commit] --- pandas/core/frame.py | 12 +++--------- pandas/io/orc.py | 19 +++++++------------ 2 files changed, 10 insertions(+), 21 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 49ba0f4cbba5f..9a3e2ddc6b463 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2905,9 +2905,9 @@ def to_parquet( def to_orc( self, path: FilePath | WriteBuffer[bytes] | None = None, - engine: Literal['pyarrow'] = 'pyarrow', + engine: Literal["pyarrow"] = "pyarrow", index: bool = None, - **kwargs + **kwargs, ) -> bytes: """ Write a DataFrame to the ORC format. 
@@ -2969,13 +2969,7 @@ def to_orc( """ from pandas.io.orc import to_orc - return to_orc( - self, - path, - engine, - index=index, - **kwargs - ) + return to_orc(self, path, engine, index=index, **kwargs) @Substitution( header_type="bool", diff --git a/pandas/io/orc.py b/pandas/io/orc.py index 02bf9f70406dc..526124e209fa7 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -1,11 +1,11 @@ """ orc compat """ from __future__ import annotations +from tempfile import gettempdir from typing import ( TYPE_CHECKING, Literal, ) -from tempfile import gettempdir from pandas._typing import ( FilePath, @@ -62,9 +62,9 @@ def read_orc( def to_orc( df: DataFrame, path: FilePath | WriteBuffer[bytes] | None = None, - engine: Literal['pyarrow'] = 'pyarrow', + engine: Literal["pyarrow"] = "pyarrow", index: bool = None, - **kwargs + **kwargs, ) -> bytes: """ Write a DataFrame to the ORC format. @@ -99,23 +99,18 @@ def to_orc( index = df.index.names[0] is not None if engine != "pyarrow": - raise ValueError( - f"engine must be 'pyarrow'" - ) - engine = import_optional_dependency(engine, min_version='5.0.0') - + raise ValueError(f"engine must be 'pyarrow'") + engine = import_optional_dependency(engine, min_version="5.0.0") if hasattr(path, "write"): engine.orc.write_table( - engine.Table.from_pandas(df, preserve_index=index), - path, **kwargs + engine.Table.from_pandas(df, preserve_index=index), path, **kwargs ) else: # to bytes: pyarrow auto closes buffers hence we read a pyarrow buffer with engine.BufferOutputStream() as stream: # if that is possible engine.orc.write_table( - engine.Table.from_pandas(df, preserve_index=index), - stream, **kwargs + engine.Table.from_pandas(df, preserve_index=index), stream, **kwargs ) orc_bytes = stream.getvalue().to_pybytes() if path is None: From 2a54b8c11beb956c8e59095aecb7608cb002d095 Mon Sep 17 00:00:00 2001 From: Ian Alexander Joiner Date: Sun, 20 Mar 2022 00:44:02 -0400 Subject: [PATCH 13/49] Fix issues according to comments --- pandas/core/frame.py | 3 +-- pandas/io/orc.py | 9 ++++----- pandas/tests/io/test_orc.py | 3 +++ 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 9a3e2ddc6b463..b300b8c714a1c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2919,8 +2919,7 @@ def to_orc( when writing a partitioned dataset. By file-like object, we refer to objects with a write() method, such as a file handle (e.g. via builtin open function). If path is None, - a bytes object is returned. Note that currently the pyarrow - engine doesn't work with io.BytesIO. + a bytes object is returned. engine : {{'pyarrow'}}, default 'pyarrow' ORC library to use, or library itself, checked with 'pyarrow' name and version >= 5.0.0 diff --git a/pandas/io/orc.py b/pandas/io/orc.py index 526124e209fa7..2d89573982b39 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -76,8 +76,7 @@ def to_orc( when writing a partitioned dataset. By file-like object, we refer to objects with a write() method, such as a file handle (e.g. via builtin open function). If path is None, - a bytes object is returned. Note that currently the pyarrow - engine doesn't work with io.BytesIO. + a bytes object is returned. 
engine : {{'pyarrow'}}, default 'pyarrow' Parquet library to use, or library it self, checked with 'pyarrow' name and version >= 5.0.0 @@ -100,7 +99,7 @@ def to_orc( if engine != "pyarrow": raise ValueError(f"engine must be 'pyarrow'") - engine = import_optional_dependency(engine, min_version="5.0.0") + engine = import_optional_dependency(engine, min_version="4.0.1") if hasattr(path, "write"): engine.orc.write_table( @@ -112,9 +111,9 @@ def to_orc( engine.orc.write_table( engine.Table.from_pandas(df, preserve_index=index), stream, **kwargs ) - orc_bytes = stream.getvalue().to_pybytes() + orc_bytes = stream.getvalue() if path is None: - return orc_bytes + return orc_bytes.to_pybytes() # allows writing to any (fsspec) URL with get_handle(path, "wb", is_text=False) as handles: handles.handle.write(orc_bytes) diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py index 211352cebcb73..986f02fb9a215 100644 --- a/pandas/tests/io/test_orc.py +++ b/pandas/tests/io/test_orc.py @@ -227,6 +227,9 @@ def test_orc_reader_snappy_compressed(dirpath): def test_orc_roundtrip(dirpath): + # GH44554 + # PyArrow gained ORC write support with the current argument order + pytest.importorskip("pyarrow", minversion="7.0.0") data = { "boolean1": np.array([False, True], dtype="bool"), "byte1": np.array([1, 100], dtype="int8"), From 1caec9ee5661d8f7d1afaea81c88dc6ef89ba493 Mon Sep 17 00:00:00 2001 From: Ian Alexander Joiner Date: Mon, 21 Mar 2022 04:57:49 -0400 Subject: [PATCH 14/49] Simplify the code base after raising Arrow version to 7.0.0 --- pandas/io/orc.py | 28 +++++++--------------------- 1 file changed, 7 insertions(+), 21 deletions(-) diff --git a/pandas/io/orc.py b/pandas/io/orc.py index 2d89573982b39..21af6fe9fb84b 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -1,7 +1,6 @@ """ orc compat """ from __future__ import annotations -from tempfile import gettempdir from typing import ( TYPE_CHECKING, Literal, @@ -79,7 +78,7 @@ def to_orc( a bytes object is returned. engine : {{'pyarrow'}}, default 'pyarrow' Parquet library to use, or library it self, checked with 'pyarrow' name - and version >= 5.0.0 + and version >= 7.0.0 index : bool, default None If ``True``, include the dataframe's index(es) in the file output. If ``False``, they will not be written to the file. 
@@ -98,22 +97,9 @@ def to_orc( index = df.index.names[0] is not None if engine != "pyarrow": - raise ValueError(f"engine must be 'pyarrow'") - engine = import_optional_dependency(engine, min_version="4.0.1") - - if hasattr(path, "write"): - engine.orc.write_table( - engine.Table.from_pandas(df, preserve_index=index), path, **kwargs - ) - else: - # to bytes: pyarrow auto closes buffers hence we read a pyarrow buffer - with engine.BufferOutputStream() as stream: # if that is possible - engine.orc.write_table( - engine.Table.from_pandas(df, preserve_index=index), stream, **kwargs - ) - orc_bytes = stream.getvalue() - if path is None: - return orc_bytes.to_pybytes() - # allows writing to any (fsspec) URL - with get_handle(path, "wb", is_text=False) as handles: - handles.handle.write(orc_bytes) + raise ValueError("engine must be 'pyarrow'") + engine = import_optional_dependency(engine, min_version="7.0.0") + + engine.orc.write_table( + engine.Table.from_pandas(df, preserve_index=index), path, **kwargs + ) From 6f0a5380c08c6972bf6c7213bf22fcce3463f6bd Mon Sep 17 00:00:00 2001 From: Ian Alexander Joiner Date: Mon, 21 Mar 2022 05:36:01 -0400 Subject: [PATCH 15/49] Fix min arrow version in to_orc --- pandas/core/frame.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b300b8c714a1c..e95ca119e6057 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2922,7 +2922,7 @@ def to_orc( a bytes object is returned. engine : {{'pyarrow'}}, default 'pyarrow' ORC library to use, or library itself, checked with 'pyarrow' name - and version >= 5.0.0 + and version >= 7.0.0 index : bool, default None If ``True``, include the dataframe's index(es) in the file output. If ``False``, they will not be written to the file. 
@@ -2952,7 +2952,7 @@ def to_orc( Examples -------- - >>> df = pd.DataFrame(data={{'col1': [1, 2], 'col2': [3, 4]}}) + >>> df = pd.DataFrame(data={'col1': [1, 2], 'col2': [3, 4]}) >>> df.to_orc('df.orc') # doctest: +SKIP >>> pd.read_orc('df.orc') # doctest: +SKIP col1 col2 From ae65214a58f8eef63166119dbc5c990e8f1e7119 Mon Sep 17 00:00:00 2001 From: Ian Alexander Joiner Date: Mon, 21 Mar 2022 05:44:43 -0400 Subject: [PATCH 16/49] Add to_orc test in line with other formats --- pandas/tests/io/test_common.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index fc605637dbc11..66905d7b7112f 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -222,6 +222,7 @@ def test_read_non_existent(self, reader, module, error_class, fn_ext): (pd.DataFrame.to_html, "os", OSError, "html"), (pd.DataFrame.to_excel, "xlrd", OSError, "xlsx"), (pd.DataFrame.to_feather, "pyarrow", OSError, "feather"), + (pd.DataFrame.to_orc, "pyarrow", OSError, "orc"), (pd.DataFrame.to_parquet, "pyarrow", OSError, "parquet"), (pd.DataFrame.to_stata, "os", OSError, "dta"), (pd.DataFrame.to_json, "os", OSError, "json"), From 045c411d8640a002e2463c1df1b0ced498ca3bd9 Mon Sep 17 00:00:00 2001 From: Ian Alexander Joiner Date: Tue, 22 Mar 2022 02:27:27 -0400 Subject: [PATCH 17/49] Add BytesIO support & test --- doc/source/user_guide/scale.rst | 17 ----------------- pandas/io/orc.py | 11 ++++++++++- pandas/tests/io/test_orc.py | 25 ++++++++++++++++++++++++- 3 files changed, 34 insertions(+), 19 deletions(-) diff --git a/doc/source/user_guide/scale.rst b/doc/source/user_guide/scale.rst index cf8a0c9845e62..129f43dd36930 100644 --- a/doc/source/user_guide/scale.rst +++ b/doc/source/user_guide/scale.rst @@ -18,23 +18,6 @@ tool for all situations. If you're working with very large datasets and a tool like PostgreSQL fits your needs, then you should probably be using that. Assuming you want or need the expressiveness and power of pandas, let's carry on. -.. ipython:: python - - import pandas as pd - import numpy as np - -.. 
ipython:: python - :suppress: - - from pandas._testing import _make_timeseries - - # Make a random in-memory dataset - ts = _make_timeseries(freq="30S", seed=0) - ts.to_csv("timeseries.csv") - ts.to_orc("timeseries.orc") - ts.to_parquet("timeseries.parquet") - - Load less data -------------- diff --git a/pandas/io/orc.py b/pandas/io/orc.py index 21af6fe9fb84b..08645a87f09dd 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -1,6 +1,7 @@ """ orc compat """ from __future__ import annotations +import io from typing import ( TYPE_CHECKING, Literal, @@ -100,6 +101,14 @@ def to_orc( raise ValueError("engine must be 'pyarrow'") engine = import_optional_dependency(engine, min_version="7.0.0") + path_or_buf: FilePath | WriteBuffer[bytes] = io.BytesIO() if path is None else path engine.orc.write_table( - engine.Table.from_pandas(df, preserve_index=index), path, **kwargs + engine.Table.from_pandas(df, preserve_index=index), path_or_buf, **kwargs ) + + if path is None: + assert isinstance(path_or_buf, io.BytesIO) + return path_or_buf.getvalue() + else: + return None + diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py index 986f02fb9a215..2eeed0adc379c 100644 --- a/pandas/tests/io/test_orc.py +++ b/pandas/tests/io/test_orc.py @@ -226,7 +226,7 @@ def test_orc_reader_snappy_compressed(dirpath): tm.assert_equal(expected, got) -def test_orc_roundtrip(dirpath): +def test_orc_roundtrip_file(dirpath): # GH44554 # PyArrow gained ORC write support with the current argument order pytest.importorskip("pyarrow", minversion="7.0.0") @@ -248,3 +248,26 @@ def test_orc_roundtrip(dirpath): got = read_orc(outputfile) tm.assert_equal(expected, got) + + +def test_orc_roundtrip_bytesio(): + # GH44554 + # PyArrow gained ORC write support with the current argument order + pytest.importorskip("pyarrow", minversion="7.0.0") + data = { + "boolean1": np.array([False, True], dtype="bool"), + "byte1": np.array([1, 100], dtype="int8"), + "short1": np.array([1024, 2048], dtype="int16"), + "int1": np.array([65536, 65536], dtype="int32"), + "long1": np.array([9223372036854775807, 9223372036854775807], dtype="int64"), + "float1": np.array([1.0, 2.0], dtype="float32"), + "double1": np.array([-15.0, -5.0], dtype="float64"), + "bytes1": np.array([b"\x00\x01\x02\x03\x04", b""], dtype="object"), + "string1": np.array(["hi", "bye"], dtype="object"), + } + expected = pd.DataFrame.from_dict(data) + + bytesio = expected.to_orc() + got = read_orc(bytesio) + + tm.assert_equal(expected, got) From c00ed0f039594d48fe80243afed27882b9dbf33e Mon Sep 17 00:00:00 2001 From: Ian Alexander Joiner Date: Tue, 22 Mar 2022 03:16:12 -0400 Subject: [PATCH 18/49] Fix some docs issues --- pandas/core/frame.py | 8 ++++---- pandas/io/orc.py | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index e95ca119e6057..14d0e052a0f8f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2906,15 +2906,15 @@ def to_orc( self, path: FilePath | WriteBuffer[bytes] | None = None, engine: Literal["pyarrow"] = "pyarrow", - index: bool = None, + index: bool | None = None, **kwargs, - ) -> bytes: + ) -> bytes | None: """ Write a DataFrame to the ORC format. Parameters ---------- df : DataFrame - path : str or file-like object, default None + path : str, file-like object or None, default None If a string, it will be used as Root Directory path when writing a partitioned dataset. 
By file-like object, we refer to objects with a write() method, such as a file handle @@ -2923,7 +2923,7 @@ def to_orc( engine : {{'pyarrow'}}, default 'pyarrow' ORC library to use, or library itself, checked with 'pyarrow' name and version >= 7.0.0 - index : bool, default None + index : bool, optional If ``True``, include the dataframe's index(es) in the file output. If ``False``, they will not be written to the file. If ``None``, similar to ``infer`` the dataframe's index(es) diff --git a/pandas/io/orc.py b/pandas/io/orc.py index 08645a87f09dd..61d7cdbccd53a 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -63,15 +63,15 @@ def to_orc( df: DataFrame, path: FilePath | WriteBuffer[bytes] | None = None, engine: Literal["pyarrow"] = "pyarrow", - index: bool = None, + index: bool | None = None, **kwargs, -) -> bytes: +) -> bytes | None: """ Write a DataFrame to the ORC format. Parameters ---------- df : DataFrame - path : str or file-like object, default None + path : str, file-like object or None, default None If a string, it will be used as Root Directory path when writing a partitioned dataset. By file-like object, we refer to objects with a write() method, such as a file handle @@ -80,7 +80,7 @@ def to_orc( engine : {{'pyarrow'}}, default 'pyarrow' Parquet library to use, or library it self, checked with 'pyarrow' name and version >= 7.0.0 - index : bool, default None + index : bool, optional If ``True``, include the dataframe's index(es) in the file output. If ``False``, they will not be written to the file. If ``None``, similar to ``infer`` the dataframe's index(es) From fe275d7f21390127414905a1eb4c3791c6d98663 Mon Sep 17 00:00:00 2001 From: Ian Alexander Joiner Date: Fri, 25 Mar 2022 16:29:56 -0400 Subject: [PATCH 19/49] Use keyword only arguments --- pandas/core/frame.py | 1 + pandas/io/orc.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 14d0e052a0f8f..97661df2cef61 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2905,6 +2905,7 @@ def to_parquet( def to_orc( self, path: FilePath | WriteBuffer[bytes] | None = None, + *, engine: Literal["pyarrow"] = "pyarrow", index: bool | None = None, **kwargs, diff --git a/pandas/io/orc.py b/pandas/io/orc.py index 61d7cdbccd53a..f49579425b387 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -62,6 +62,7 @@ def read_orc( def to_orc( df: DataFrame, path: FilePath | WriteBuffer[bytes] | None = None, + *, engine: Literal["pyarrow"] = "pyarrow", index: bool | None = None, **kwargs, @@ -109,6 +110,5 @@ def to_orc( if path is None: assert isinstance(path_or_buf, io.BytesIO) return path_or_buf.getvalue() - else: - return None + return None From 9d3e0dfd464e41224f3a7a47d3b344b51d562f0d Mon Sep 17 00:00:00 2001 From: Ian Alexander Joiner Date: Thu, 12 May 2022 01:24:53 -0400 Subject: [PATCH 20/49] Fix bug --- pandas/io/orc.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/pandas/io/orc.py b/pandas/io/orc.py index f49579425b387..d3a683ae93aa2 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -102,13 +102,14 @@ def to_orc( raise ValueError("engine must be 'pyarrow'") engine = import_optional_dependency(engine, min_version="7.0.0") - path_or_buf: FilePath | WriteBuffer[bytes] = io.BytesIO() if path is None else path - engine.orc.write_table( - engine.Table.from_pandas(df, preserve_index=index), path_or_buf, **kwargs - ) - - if path is None: - assert isinstance(path_or_buf, io.BytesIO) - return 
path_or_buf.getvalue() + was_none = path is None + if was_none: + path = io.BytesIO() + with get_handle(path, "wb") as handles: + engine.orc.write_table( + engine.Table.from_pandas(df, preserve_index=index), handles.handle, **kwargs + ) + + if was_none: + return path.getvalue() return None - From 971f31c14abce5fdd03e813619fc07b2bbe2f4d8 Mon Sep 17 00:00:00 2001 From: Ian Alexander Joiner Date: Sat, 28 May 2022 22:24:18 -0400 Subject: [PATCH 21/49] Fix param issue --- pandas/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 97661df2cef61..aed78178ffbfd 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2969,7 +2969,7 @@ def to_orc( """ from pandas.io.orc import to_orc - return to_orc(self, path, engine, index=index, **kwargs) + return to_orc(self, path, engine=engine, index=index, **kwargs) @Substitution( header_type="bool", From 52b68a0f8eeaa1cbbc50d92cea8b6baf765e0171 Mon Sep 17 00:00:00 2001 From: Ian Alexander Joiner Date: Sat, 28 May 2022 23:09:31 -0400 Subject: [PATCH 22/49] Doctest skipping due to minimal versions --- pandas/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index aed78178ffbfd..14a0b52308e59 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2962,7 +2962,7 @@ def to_orc( If you want to get a buffer to the orc content you can write it to io.BytesIO >>> import io - >>> b = io.BytesIO(df.to_orc()) + >>> b = io.BytesIO(df.to_orc()) # doctest: +SKIP >>> b.seek(0) 0 >>> content = b.read() From 76437ba361b014dd998d9ae1d33b40a72f19b538 Mon Sep 17 00:00:00 2001 From: Ian Alexander Joiner Date: Sat, 28 May 2022 23:28:12 -0400 Subject: [PATCH 23/49] Doctest skipping due to minimal versions --- pandas/core/frame.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 14a0b52308e59..7f17df9b9580f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2963,9 +2963,9 @@ def to_orc( If you want to get a buffer to the orc content you can write it to io.BytesIO >>> import io >>> b = io.BytesIO(df.to_orc()) # doctest: +SKIP - >>> b.seek(0) + >>> b.seek(0) # doctest: +SKIP 0 - >>> content = b.read() + >>> content = b.read() # doctest: +SKIP """ from pandas.io.orc import to_orc From c5d585267f2bcd76e894e1134a34e494867cea76 Mon Sep 17 00:00:00 2001 From: Ian Alexander Joiner Date: Sun, 29 May 2022 00:40:41 -0400 Subject: [PATCH 24/49] Improve spacing in docstring & remove orc test in test_common that has unusual pyarrow version requirement and is with a lot of other tests --- pandas/core/frame.py | 2 ++ pandas/tests/io/test_common.py | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 7f17df9b9580f..2d00857a14895 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2912,6 +2912,7 @@ def to_orc( ) -> bytes | None: """ Write a DataFrame to the ORC format. + Parameters ---------- df : DataFrame @@ -2934,6 +2935,7 @@ def to_orc( be included as columns in the file output. 
kwargs Additional keyword arguments passed to the engine + Returns ------- bytes if no path argument is provided else None diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index 66905d7b7112f..fc605637dbc11 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -222,7 +222,6 @@ def test_read_non_existent(self, reader, module, error_class, fn_ext): (pd.DataFrame.to_html, "os", OSError, "html"), (pd.DataFrame.to_excel, "xlrd", OSError, "xlsx"), (pd.DataFrame.to_feather, "pyarrow", OSError, "feather"), - (pd.DataFrame.to_orc, "pyarrow", OSError, "orc"), (pd.DataFrame.to_parquet, "pyarrow", OSError, "parquet"), (pd.DataFrame.to_stata, "os", OSError, "dta"), (pd.DataFrame.to_json, "os", OSError, "json"), From b5cd02212be25297c0bcb9e8b114bd21c80ce99e Mon Sep 17 00:00:00 2001 From: Ian Alexander Joiner Date: Sun, 29 May 2022 01:28:37 -0400 Subject: [PATCH 25/49] Fix docstring syntax --- pandas/core/frame.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 2d00857a14895..b7492d9a31bb1 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2924,7 +2924,7 @@ def to_orc( a bytes object is returned. engine : {{'pyarrow'}}, default 'pyarrow' ORC library to use, or library itself, checked with 'pyarrow' name - and version >= 7.0.0 + and version >= 7.0.0. index : bool, optional If ``True``, include the dataframe's index(es) in the file output. If ``False``, they will not be written to the file. @@ -2933,8 +2933,8 @@ def to_orc( the RangeIndex will be stored as a range in the metadata so it doesn't require much space and is faster. Other indexes will be included as columns in the file output. - kwargs - Additional keyword arguments passed to the engine + **kwargs + Additional keyword arguments passed to the engine. Returns ------- From 7ad3df937c872849806d43428792884beca6aed5 Mon Sep 17 00:00:00 2001 From: Ian Joiner Date: Sun, 29 May 2022 02:21:56 -0400 Subject: [PATCH 26/49] ORC is not text --- pandas/io/orc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/orc.py b/pandas/io/orc.py index d3a683ae93aa2..635d81a112dd4 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -105,7 +105,7 @@ def to_orc( was_none = path is None if was_none: path = io.BytesIO() - with get_handle(path, "wb") as handles: + with get_handle(path, "wb", is_text=False) as handles: engine.orc.write_table( engine.Table.from_pandas(df, preserve_index=index), handles.handle, **kwargs ) From a73bb706e190eebfd9e6e4274064503cb2d6f8c0 Mon Sep 17 00:00:00 2001 From: Ian Joiner Date: Sun, 29 May 2022 04:48:10 -0400 Subject: [PATCH 27/49] Fix BytesIO bug && do not require orc to be explicitly imported before usage && all pytest tests have passed --- pandas/io/orc.py | 7 +++++-- pandas/tests/io/test_orc.py | 5 +++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/pandas/io/orc.py b/pandas/io/orc.py index 635d81a112dd4..e2b63eaaedadf 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -69,6 +69,7 @@ def to_orc( ) -> bytes | None: """ Write a DataFrame to the ORC format. + Parameters ---------- df : DataFrame @@ -89,8 +90,9 @@ def to_orc( the RangeIndex will be stored as a range in the metadata so it doesn't require much space and is faster. Other indexes will be included as columns in the file output. 
- kwargs + **kwargs Additional keyword arguments passed to the engine + Returns ------- bytes if no path argument is provided else None @@ -101,12 +103,13 @@ def to_orc( if engine != "pyarrow": raise ValueError("engine must be 'pyarrow'") engine = import_optional_dependency(engine, min_version="7.0.0") + orc = import_optional_dependency("pyarrow.orc") was_none = path is None if was_none: path = io.BytesIO() with get_handle(path, "wb", is_text=False) as handles: - engine.orc.write_table( + orc.write_table( engine.Table.from_pandas(df, preserve_index=index), handles.handle, **kwargs ) diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py index 2eeed0adc379c..826514d2615a8 100644 --- a/pandas/tests/io/test_orc.py +++ b/pandas/tests/io/test_orc.py @@ -1,5 +1,6 @@ """ test orc compat """ import datetime +from io import BytesIO import os import numpy as np @@ -267,7 +268,7 @@ def test_orc_roundtrip_bytesio(): } expected = pd.DataFrame.from_dict(data) - bytesio = expected.to_orc() - got = read_orc(bytesio) + bytes = expected.to_orc() + got = read_orc(BytesIO(bytes)) tm.assert_equal(expected, got) From 20aefe79ed4bf2fb4a77f2858fbb5b678895ebc7 Mon Sep 17 00:00:00 2001 From: Ian Joiner Date: Sun, 29 May 2022 10:43:25 -0400 Subject: [PATCH 28/49] ORC writer does not work for categorical columns yet --- doc/source/user_guide/io.rst | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index f3e712197f9c5..e0999d1ef85ce 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -5569,6 +5569,7 @@ ORC format, :func:`~pandas.read_orc` and :func:`~pandas.DataFrame.to_orc`. This * It is *highly recommended* to install pyarrow using conda due to some issues occurred by pyarrow. * :func:`~pandas.read_orc` is not supported on Windows yet, you can find valid environments on :ref:`install optional dependencies `. + * Categorical columns are not supported yet. .. ipython:: python @@ -5581,8 +5582,6 @@ ORC format, :func:`~pandas.read_orc` and :func:`~pandas.DataFrame.to_orc`. 
This "e": [True, False, True], "f": pd.date_range("20130101", periods=3), "g": pd.date_range("20130101", periods=3, tz="US/Eastern"), - "h": pd.Categorical(list("abc")), - "i": pd.Categorical(list("abc"), ordered=True), } ) From e7e81fee7a23f30946169613139880aa84b104ee Mon Sep 17 00:00:00 2001 From: Ian Joiner Date: Sun, 29 May 2022 10:49:34 -0400 Subject: [PATCH 29/49] Appease mypy --- pandas/io/orc.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/io/orc.py b/pandas/io/orc.py index e2b63eaaedadf..356a82d2947ab 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -108,11 +108,13 @@ def to_orc( was_none = path is None if was_none: path = io.BytesIO() + assert path is not None # For mypy with get_handle(path, "wb", is_text=False) as handles: orc.write_table( engine.Table.from_pandas(df, preserve_index=index), handles.handle, **kwargs ) if was_none: + assert isinstance(path, io.BytesIO) # For mypy return path.getvalue() return None From 6b659f7007d10a2ecab925988fb6e5b6cf8a446e Mon Sep 17 00:00:00 2001 From: Ian Joiner Date: Sun, 29 May 2022 15:18:55 -0400 Subject: [PATCH 30/49] Appease mypy --- pandas/io/orc.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/io/orc.py b/pandas/io/orc.py index 356a82d2947ab..918f75de00c58 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -2,6 +2,7 @@ from __future__ import annotations import io +from types import ModuleType from typing import ( TYPE_CHECKING, Literal, @@ -110,6 +111,7 @@ def to_orc( path = io.BytesIO() assert path is not None # For mypy with get_handle(path, "wb", is_text=False) as handles: + assert isinstance(engine, ModuleType) # For mypy orc.write_table( engine.Table.from_pandas(df, preserve_index=index), handles.handle, **kwargs ) From 18e5429968c7e7ad653cdf46c13ae863efaaa203 Mon Sep 17 00:00:00 2001 From: Ian Joiner Date: Sun, 29 May 2022 20:23:11 -0400 Subject: [PATCH 31/49] Edit according to reviews --- doc/source/user_guide/io.rst | 11 ++--- pandas/core/frame.py | 30 +++++++++--- pandas/io/orc.py | 45 +++++++++++++++++- .../io/data/orc/TestOrcFile.testReadWrite.orc | Bin 0 -> 1344 bytes pandas/tests/io/test_orc.py | 41 ++++++++++++++++ 5 files changed, 112 insertions(+), 15 deletions(-) create mode 100644 pandas/tests/io/data/orc/TestOrcFile.testReadWrite.orc diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index e0999d1ef85ce..a7f26e53620f8 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -5569,7 +5569,8 @@ ORC format, :func:`~pandas.read_orc` and :func:`~pandas.DataFrame.to_orc`. This * It is *highly recommended* to install pyarrow using conda due to some issues occurred by pyarrow. * :func:`~pandas.read_orc` is not supported on Windows yet, you can find valid environments on :ref:`install optional dependencies `. - * Categorical columns are not supported yet. + * Unsigned integers, intervals, periods, sparse and categorical Dtypes are not supported yet. + * Currently timezones in datetime columns are not preserved when a dataframe is converted into ORC files. .. ipython:: python @@ -5577,11 +5578,9 @@ ORC format, :func:`~pandas.read_orc` and :func:`~pandas.DataFrame.to_orc`. 
This { "a": list("abc"), "b": list(range(1, 4)), - "c": np.arange(3, 6).astype("u1"), - "d": np.arange(4.0, 7.0, dtype="float64"), - "e": [True, False, True], - "f": pd.date_range("20130101", periods=3), - "g": pd.date_range("20130101", periods=3, tz="US/Eastern"), + "c": np.arange(4.0, 7.0, dtype="float64"), + "d": [True, False, True], + "e": pd.date_range("20130101", periods=3), } ) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b7492d9a31bb1..20b130191e0b9 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2913,9 +2913,10 @@ def to_orc( """ Write a DataFrame to the ORC format. + .. versionadded:: 1.5.0 + Parameters ---------- - df : DataFrame path : str, file-like object or None, default None If a string, it will be used as Root Directory path when writing a partitioned dataset. By file-like object, @@ -2923,23 +2924,32 @@ def to_orc( (e.g. via builtin open function). If path is None, a bytes object is returned. engine : {{'pyarrow'}}, default 'pyarrow' - ORC library to use, or library itself, checked with 'pyarrow' name - and version >= 7.0.0. + ORC library to use, or library it self, checked with 'pyarrow' name + and version >= 7.0.0. Raises ValueError if it is anything but + 'pyarrow'. index : bool, optional - If ``True``, include the dataframe's index(es) in the file output. If - ``False``, they will not be written to the file. + If ``True``, include the dataframe's index(es) in the file output. + If ``False``, they will not be written to the file. If ``None``, similar to ``infer`` the dataframe's index(es) will be saved. However, instead of being saved as values, the RangeIndex will be stored as a range in the metadata so it doesn't require much space and is faster. Other indexes will be included as columns in the file output. **kwargs - Additional keyword arguments passed to the engine. + Additional keyword arguments passed to the engine Returns ------- bytes if no path argument is provided else None + Raises + ------ + NotImplementedError + * Dtype of one or more columns is unsigned integers, intervals, + periods, sparse or categorical. + ValueError + * engine is not pyarrow. + See Also -------- read_orc : Read a ORC file. @@ -2950,8 +2960,14 @@ def to_orc( Notes ----- - This function requires `pyarrow ` + * Before using this function you should read the :ref:`user guide about + ORC ` and :ref:`install optional dependencies `. + * This function requires `pyarrow ` _ library. + * Unsigned integers, intervals, periods, sparse and categorical Dtypes + are not supported yet. + * Currently timezones in datetime columns are not preserved when a + dataframe is converted into ORC files. Examples -------- diff --git a/pandas/io/orc.py b/pandas/io/orc.py index 918f75de00c58..bc14a90d463cf 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -71,9 +71,14 @@ def to_orc( """ Write a DataFrame to the ORC format. + .. versionadded:: 1.5.0 + Parameters ---------- df : DataFrame + The dataframe to be written to ORC. Raises NotImplementedError + if dtype of one or more columns is category, unsigned integers, + intervals, periods or sparse. path : str, file-like object or None, default None If a string, it will be used as Root Directory path when writing a partitioned dataset. By file-like object, @@ -81,8 +86,9 @@ def to_orc( (e.g. via builtin open function). If path is None, a bytes object is returned. 
engine : {{'pyarrow'}}, default 'pyarrow' - Parquet library to use, or library it self, checked with 'pyarrow' name - and version >= 7.0.0 + ORC library to use, or library it self, checked with 'pyarrow' name + and version >= 7.0.0. Raises ValueError if it is anything but + 'pyarrow'. index : bool, optional If ``True``, include the dataframe's index(es) in the file output. If ``False``, they will not be written to the file. @@ -97,10 +103,45 @@ def to_orc( Returns ------- bytes if no path argument is provided else None + + Raises + ------ + NotImplementedError + * Dtype of one or more columns is unsigned integers, intervals, + periods, sparse or categorical. + ValueError + * engine is not pyarrow. + + Notes + ----- + * Before using this function you should read the + :ref:`user guide about ORC ` and + :ref:`install optional dependencies `. + * This function requires `pyarrow ` + _ library. + * Unsigned integers, intervals, periods, sparse and categorical Dtypes + are not supported yet. + * Currently timezones in datetime columns are not preserved when a + dataframe is converted into ORC files. """ if index is None: index = df.index.names[0] is not None + # If unsupported dtypes are found raise NotImplementedError + for dtype in df.dtypes: + dtype_str = dtype.__str__().lower() + if ( + "category" in dtype_str + or "interval" in dtype_str + or "sparse" in dtype_str + or "period" in dtype_str + or "uint" in dtype_str + ): + raise NotImplementedError( + """The dtype of one or more columns is unsigned integers, +intervals, periods, sparse or categorical which is not supported yet.""" + ) + if engine != "pyarrow": raise ValueError("engine must be 'pyarrow'") engine = import_optional_dependency(engine, min_version="7.0.0") diff --git a/pandas/tests/io/data/orc/TestOrcFile.testReadWrite.orc b/pandas/tests/io/data/orc/TestOrcFile.testReadWrite.orc new file mode 100644 index 0000000000000000000000000000000000000000..852360ecad74cb3545fd160f895fca67d68ef276 GIT binary patch literal 1344 zcma)6&5qJg6uv*b?P*25Q&BHP6C{}oYC<5Im~=B_CQ&y!X5!Ao1TrWjgeC(nW@8$k zz&FrGaI35CH1jkrTvHFXmkRfFi(;;QU=Mge zDVnzfQFzyYg#b+cqUwAes6}uc10PZ;)0#?Km{b=@h)D_36ePY?no10J7vUWYZ^0}# z4=R@w%kBq`?&Kb@sOswu$nX7~f{;$7Akev>w3(UGa%M^&XIL8AE5s`~U=W~JwcZY} zI^K}pSwCchp~5iF@Y%_;89m9fw-ks(YfKp-_{y`7)HJ_neU@j0r2E($eyC`%X|SB- z*;Bl4lyN7|4(PYR5WgsJ!IeZ^)kxJ%jZ9TEausQG)Lo52?P+w?Lyew6P-U%sU57m? 
zhMGvKks2hD8O;>eIX9XsS8K;;M}gMDXrb`F;rU#nU6Xr8drImYv}O|Y`6;5%zHfZr z_`VXjaoFeDk!ff3xLRZVh(CMrI`~P0Vry4+b$vxQHgnrTUcQI_v45*27wp|~+ctL$ zxcnia62e!ey0!1D!Rl|I6#DZ`5v~V*1id5~^}FMCBzkAlemim}gXDDDj-25bk7AUJ z-=q0%lyp&gC&`E2XkIlho}e+Ao({31MjE6=>Hx$tY8kVPTjp3MEORXb%e;cMSgc19 zFGmuujU-+lNxWqw@uee)SMT@szl48EwvQyXBZ)Uh()s-IIqivd^GWj&eBtiuK SNU3p7TkfYnoF5$@R{jEaEZds^ literal 0 HcmV?d00001 diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py index 826514d2615a8..5f075a118017c 100644 --- a/pandas/tests/io/test_orc.py +++ b/pandas/tests/io/test_orc.py @@ -272,3 +272,44 @@ def test_orc_roundtrip_bytesio(): got = read_orc(BytesIO(bytes)) tm.assert_equal(expected, got) + + +testdata = [ + (pd.DataFrame({"unimpl": np.array([1, 20], dtype="uint64")}), dirpath), + (pd.DataFrame({"unimpl": pd.Series(["a", "b", "a"], dtype="category")}), dirpath), + ( + pd.DataFrame( + {"unimpl": [pd.Interval(left=0, right=2), pd.Interval(left=0, right=5)]} + ), + dirpath, + ), + ( + pd.DataFrame( + { + "unimpl": [ + pd.Period("2022-01-03", freq="D"), + pd.Period("2022-01-04", freq="D"), + ] + } + ), + dirpath, + ), + ( + pd.DataFrame({"unimpl": [np.nan] * 100}).astype( + pd.SparseDtype("float", np.nan) + ), + dirpath, + ), +] + + +@pytest.mark.parametrize("unimplemented, dirpath", testdata) +def test_orc_writer_unimplemented_dtypes(unimplemented, dirpath): + # GH44554 + # PyArrow gained ORC write support with the current argument order + pytest.importorskip("pyarrow", minversion="7.0.0") + outputfile = os.path.join(dirpath, "TestOrcFile.testReadWrite.orc") + msg = """The dtype of one or more columns is unsigned integers, +intervals, periods, sparse or categorical which is not supported yet.""" + with pytest.raises(NotImplementedError, match=msg): + unimplemented.to_orc(outputfile) From 21cba6ed7e9196dd24cf2e6509ab6f44ad47e8eb Mon Sep 17 00:00:00 2001 From: Ian Joiner Date: Sun, 29 May 2022 20:30:06 -0400 Subject: [PATCH 32/49] Fix path bug in test_orc --- pandas/tests/io/test_orc.py | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py index 5f075a118017c..d0fe5325a1c44 100644 --- a/pandas/tests/io/test_orc.py +++ b/pandas/tests/io/test_orc.py @@ -275,13 +275,12 @@ def test_orc_roundtrip_bytesio(): testdata = [ - (pd.DataFrame({"unimpl": np.array([1, 20], dtype="uint64")}), dirpath), - (pd.DataFrame({"unimpl": pd.Series(["a", "b", "a"], dtype="category")}), dirpath), + (pd.DataFrame({"unimpl": np.array([1, 20], dtype="uint64")})), + (pd.DataFrame({"unimpl": pd.Series(["a", "b", "a"], dtype="category")})), ( pd.DataFrame( {"unimpl": [pd.Interval(left=0, right=2), pd.Interval(left=0, right=5)]} ), - dirpath, ), ( pd.DataFrame( @@ -292,24 +291,17 @@ def test_orc_roundtrip_bytesio(): ] } ), - dirpath, - ), - ( - pd.DataFrame({"unimpl": [np.nan] * 100}).astype( - pd.SparseDtype("float", np.nan) - ), - dirpath, ), + (pd.DataFrame({"unimpl": [np.nan] * 100}).astype(pd.SparseDtype("float", np.nan)),), ] -@pytest.mark.parametrize("unimplemented, dirpath", testdata) -def test_orc_writer_unimplemented_dtypes(unimplemented, dirpath): +@pytest.mark.parametrize("unimplemented", testdata) +def test_orc_writer_unimplemented_dtypes(unimplemented): # GH44554 # PyArrow gained ORC write support with the current argument order pytest.importorskip("pyarrow", minversion="7.0.0") - outputfile = os.path.join(dirpath, "TestOrcFile.testReadWrite.orc") msg = """The dtype of one or more columns is unsigned integers, intervals, periods, sparse or 
categorical which is not supported yet.""" with pytest.raises(NotImplementedError, match=msg): - unimplemented.to_orc(outputfile) + unimplemented.to_orc() From c7bf39ff2c400deecfd41b216e52dfd1321d1c58 Mon Sep 17 00:00:00 2001 From: Ian Joiner Date: Sun, 29 May 2022 20:59:25 -0400 Subject: [PATCH 33/49] Fix testdata tuple bug in test_orc --- pandas/tests/io/test_orc.py | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py index d0fe5325a1c44..bac17c2f88584 100644 --- a/pandas/tests/io/test_orc.py +++ b/pandas/tests/io/test_orc.py @@ -275,24 +275,20 @@ def test_orc_roundtrip_bytesio(): testdata = [ - (pd.DataFrame({"unimpl": np.array([1, 20], dtype="uint64")})), - (pd.DataFrame({"unimpl": pd.Series(["a", "b", "a"], dtype="category")})), - ( - pd.DataFrame( - {"unimpl": [pd.Interval(left=0, right=2), pd.Interval(left=0, right=5)]} - ), + pd.DataFrame({"unimpl": np.array([1, 20], dtype="uint64")}), + pd.DataFrame({"unimpl": pd.Series(["a", "b", "a"], dtype="category")}), + pd.DataFrame( + {"unimpl": [pd.Interval(left=0, right=2), pd.Interval(left=0, right=5)]} ), - ( - pd.DataFrame( - { - "unimpl": [ - pd.Period("2022-01-03", freq="D"), - pd.Period("2022-01-04", freq="D"), - ] - } - ), + pd.DataFrame( + { + "unimpl": [ + pd.Period("2022-01-03", freq="D"), + pd.Period("2022-01-04", freq="D"), + ] + } ), - (pd.DataFrame({"unimpl": [np.nan] * 100}).astype(pd.SparseDtype("float", np.nan)),), + pd.DataFrame({"unimpl": [np.nan] * 100}).astype(pd.SparseDtype("float", np.nan)), ] From e43c6dd73bc2ac30aac771a4924dc0568ceccd28 Mon Sep 17 00:00:00 2001 From: Ian Joiner Date: Sun, 29 May 2022 21:28:39 -0400 Subject: [PATCH 34/49] Fix docstrings for check compliance --- pandas/core/frame.py | 2 +- pandas/io/orc.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 20b130191e0b9..4682ac4878bca 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2936,7 +2936,7 @@ def to_orc( doesn't require much space and is faster. Other indexes will be included as columns in the file output. **kwargs - Additional keyword arguments passed to the engine + Additional keyword arguments passed to the engine. Returns ------- diff --git a/pandas/io/orc.py b/pandas/io/orc.py index bc14a90d463cf..51b29ce8144e3 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -98,7 +98,7 @@ def to_orc( doesn't require much space and is faster. Other indexes will be included as columns in the file output. **kwargs - Additional keyword arguments passed to the engine + Additional keyword arguments passed to the engine. Returns ------- From afa0a8a3735c082c8855e9e721caafc7e751922e Mon Sep 17 00:00:00 2001 From: Ian Joiner Date: Sun, 29 May 2022 22:14:37 -0400 Subject: [PATCH 35/49] read_orc does not have engine as a param --- doc/source/user_guide/io.rst | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index a7f26e53620f8..973e978a1453f 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -5599,7 +5599,7 @@ Read from an orc file. .. ipython:: python :okwarning: - result = pd.read_orc("example_pa.orc", engine="pyarrow") + result = pd.read_orc("example_pa.orc") result.dtypes @@ -5609,7 +5609,6 @@ Read only certain columns of an orc file. 
result = pd.read_orc( "example_pa.orc", - engine="pyarrow", columns=["a", "b"], ) result.dtypes From cd585e678432c5359fab1bd07dd5a0277fdf0e6b Mon Sep 17 00:00:00 2001 From: Ian Joiner Date: Sun, 29 May 2022 23:54:54 -0400 Subject: [PATCH 36/49] Fix sphinx warnings --- pandas/core/frame.py | 16 ++++++++-------- pandas/io/orc.py | 18 +++++++++--------- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 4682ac4878bca..51c5c4a7d802a 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2945,10 +2945,10 @@ def to_orc( Raises ------ NotImplementedError - * Dtype of one or more columns is unsigned integers, intervals, - periods, sparse or categorical. + Dtype of one or more columns is category, unsigned integers, interval, + period or sparse. ValueError - * engine is not pyarrow. + engine is not pyarrow. See Also -------- @@ -2961,13 +2961,13 @@ def to_orc( Notes ----- * Before using this function you should read the :ref:`user guide about - ORC ` and :ref:`install optional dependencies `. - * This function requires `pyarrow ` - _ library. + ORC ` and :ref:`install optional dependencies `. + * This function requires `pyarrow `_ + library. * Unsigned integers, intervals, periods, sparse and categorical Dtypes - are not supported yet. + are not supported yet. * Currently timezones in datetime columns are not preserved when a - dataframe is converted into ORC files. + dataframe is converted into ORC files. Examples -------- diff --git a/pandas/io/orc.py b/pandas/io/orc.py index 51b29ce8144e3..02f43855c4340 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -107,22 +107,22 @@ def to_orc( Raises ------ NotImplementedError - * Dtype of one or more columns is unsigned integers, intervals, - periods, sparse or categorical. + Dtype of one or more columns is category, unsigned integers, interval, + period or sparse. ValueError - * engine is not pyarrow. + engine is not pyarrow. Notes ----- * Before using this function you should read the - :ref:`user guide about ORC ` and - :ref:`install optional dependencies `. - * This function requires `pyarrow ` - _ library. + :ref:`user guide about ORC ` and + :ref:`install optional dependencies `. + * This function requires `pyarrow `_ + library. * Unsigned integers, intervals, periods, sparse and categorical Dtypes - are not supported yet. + are not supported yet. * Currently timezones in datetime columns are not preserved when a - dataframe is converted into ORC files. + dataframe is converted into ORC files. """ if index is None: index = df.index.names[0] is not None From b509c3c22c5be1eaba6af400cc585b71c4939d26 Mon Sep 17 00:00:00 2001 From: Ian Joiner Date: Mon, 30 May 2022 17:56:30 -0400 Subject: [PATCH 37/49] Improve docs & rerun tests --- pandas/core/frame.py | 2 +- pandas/io/orc.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 51c5c4a7d802a..38ee793f283cd 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2964,7 +2964,7 @@ def to_orc( ORC ` and :ref:`install optional dependencies `. * This function requires `pyarrow `_ library. - * Unsigned integers, intervals, periods, sparse and categorical Dtypes + * Category, unsigned integers, interval, period and sparse Dtypes are not supported yet. * Currently timezones in datetime columns are not preserved when a dataframe is converted into ORC files. 
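For reference, a minimal round-trip sketch of the usage these documentation hunks describe (not part of the diff itself; assumes pyarrow >= 7.0.0 is installed, and the file name ``example_pa.orc`` is only illustrative):

.. code-block:: python

    import numpy as np
    import pandas as pd

    # Only ORC-writable dtypes here: no categorical, unsigned integer, interval,
    # period or sparse columns, which the writer rejects at this point in the series.
    df = pd.DataFrame(
        {
            "a": list("abc"),
            "b": list(range(1, 4)),
            "c": np.arange(4.0, 7.0, dtype="float64"),
            "d": [True, False, True],
            "e": pd.date_range("20130101", periods=3),
        }
    )

    df.to_orc("example_pa.orc")  # written via the pyarrow engine

    # read_orc takes no engine argument; a column subset can be requested directly.
    result = pd.read_orc("example_pa.orc", columns=["a", "b"])
    print(result.dtypes)

As noted in the docstrings above, the round trip does not preserve timezone information in datetime columns.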
diff --git a/pandas/io/orc.py b/pandas/io/orc.py index 02f43855c4340..655753e22cd05 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -119,7 +119,7 @@ def to_orc( :ref:`install optional dependencies `. * This function requires `pyarrow `_ library. - * Unsigned integers, intervals, periods, sparse and categorical Dtypes + * Category, unsigned integers, interval, period and sparse Dtypes are not supported yet. * Currently timezones in datetime columns are not preserved when a dataframe is converted into ORC files. From 1001002907fa8892d65f3ab7fe7df5a1cd8d7d00 Mon Sep 17 00:00:00 2001 From: Ian Joiner Date: Mon, 30 May 2022 19:37:22 -0400 Subject: [PATCH 38/49] Force retrigger --- pandas/tests/io/test_orc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py index bac17c2f88584..d09ecf6207926 100644 --- a/pandas/tests/io/test_orc.py +++ b/pandas/tests/io/test_orc.py @@ -288,7 +288,7 @@ def test_orc_roundtrip_bytesio(): ] } ), - pd.DataFrame({"unimpl": [np.nan] * 100}).astype(pd.SparseDtype("float", np.nan)), + pd.DataFrame({"unimpl": [np.nan] * 50}).astype(pd.SparseDtype("float", np.nan)), ] From 55cab6ee3551eb3efd68783dab267299e889b993 Mon Sep 17 00:00:00 2001 From: Ian Joiner Date: Tue, 7 Jun 2022 02:07:30 -0400 Subject: [PATCH 39/49] Fix test_orc according to review --- pandas/tests/io/test_orc.py | 48 ++++++++++++++++++++----------------- 1 file changed, 26 insertions(+), 22 deletions(-) diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py index d09ecf6207926..5364b25b4a61f 100644 --- a/pandas/tests/io/test_orc.py +++ b/pandas/tests/io/test_orc.py @@ -6,6 +6,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas import read_orc import pandas._testing as tm @@ -22,6 +24,26 @@ def dirpath(datapath): return datapath("io", "data", "orc") +# Examples of dataframes with dtypes for which conversion to ORC +# hasn't been implemented yet. 
+orc_writer_not_implemented = [ + pd.DataFrame({"unimpl": np.array([1, 20], dtype="uint64")}), + pd.DataFrame({"unimpl": pd.Series(["a", "b", "a"], dtype="category")}), + pd.DataFrame( + {"unimpl": [pd.Interval(left=0, right=2), pd.Interval(left=0, right=5)]} + ), + pd.DataFrame( + { + "unimpl": [ + pd.Period("2022-01-03", freq="D"), + pd.Period("2022-01-04", freq="D"), + ] + } + ), + pd.DataFrame({"unimpl": [np.nan] * 50}).astype(pd.SparseDtype("float", np.nan)), +] + + def test_orc_reader_empty(dirpath): columns = [ "boolean1", @@ -227,10 +249,10 @@ def test_orc_reader_snappy_compressed(dirpath): tm.assert_equal(expected, got) +@td.skip_if_no("pyarrow", min_version="7.0.0") def test_orc_roundtrip_file(dirpath): # GH44554 # PyArrow gained ORC write support with the current argument order - pytest.importorskip("pyarrow", minversion="7.0.0") data = { "boolean1": np.array([False, True], dtype="bool"), "byte1": np.array([1, 100], dtype="int8"), @@ -251,10 +273,10 @@ def test_orc_roundtrip_file(dirpath): tm.assert_equal(expected, got) +@td.skip_if_no("pyarrow", min_version="7.0.0") def test_orc_roundtrip_bytesio(): # GH44554 # PyArrow gained ORC write support with the current argument order - pytest.importorskip("pyarrow", minversion="7.0.0") data = { "boolean1": np.array([False, True], dtype="bool"), "byte1": np.array([1, 100], dtype="int8"), @@ -274,29 +296,11 @@ def test_orc_roundtrip_bytesio(): tm.assert_equal(expected, got) -testdata = [ - pd.DataFrame({"unimpl": np.array([1, 20], dtype="uint64")}), - pd.DataFrame({"unimpl": pd.Series(["a", "b", "a"], dtype="category")}), - pd.DataFrame( - {"unimpl": [pd.Interval(left=0, right=2), pd.Interval(left=0, right=5)]} - ), - pd.DataFrame( - { - "unimpl": [ - pd.Period("2022-01-03", freq="D"), - pd.Period("2022-01-04", freq="D"), - ] - } - ), - pd.DataFrame({"unimpl": [np.nan] * 50}).astype(pd.SparseDtype("float", np.nan)), -] - - -@pytest.mark.parametrize("unimplemented", testdata) +@td.skip_if_no("pyarrow", min_version="7.0.0") +@pytest.mark.parametrize("unimplemented", orc_writer_not_implemented) def test_orc_writer_unimplemented_dtypes(unimplemented): # GH44554 # PyArrow gained ORC write support with the current argument order - pytest.importorskip("pyarrow", minversion="7.0.0") msg = """The dtype of one or more columns is unsigned integers, intervals, periods, sparse or categorical which is not supported yet.""" with pytest.raises(NotImplementedError, match=msg): From 89283e0d2e7a9bf80894e0edd07b6d3aeeafe6c8 Mon Sep 17 00:00:00 2001 From: Ian Joiner Date: Tue, 7 Jun 2022 08:46:02 -0400 Subject: [PATCH 40/49] Rename some variables and func --- pandas/tests/io/test_orc.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py index 5364b25b4a61f..932918c75dec5 100644 --- a/pandas/tests/io/test_orc.py +++ b/pandas/tests/io/test_orc.py @@ -25,8 +25,9 @@ def dirpath(datapath): # Examples of dataframes with dtypes for which conversion to ORC -# hasn't been implemented yet. -orc_writer_not_implemented = [ +# hasn't been implemented yet, that is, Category, unsigned integers, +# interval, period and sparse. 
+orc_writer_dtypes_not_supported = [ pd.DataFrame({"unimpl": np.array([1, 20], dtype="uint64")}), pd.DataFrame({"unimpl": pd.Series(["a", "b", "a"], dtype="category")}), pd.DataFrame( @@ -297,11 +298,11 @@ def test_orc_roundtrip_bytesio(): @td.skip_if_no("pyarrow", min_version="7.0.0") -@pytest.mark.parametrize("unimplemented", orc_writer_not_implemented) -def test_orc_writer_unimplemented_dtypes(unimplemented): +@pytest.mark.parametrize("df_not_supported", orc_writer_dtypes_not_supported) +def test_orc_writer_dtypes_not_supported(df_not_supported): # GH44554 # PyArrow gained ORC write support with the current argument order msg = """The dtype of one or more columns is unsigned integers, intervals, periods, sparse or categorical which is not supported yet.""" with pytest.raises(NotImplementedError, match=msg): - unimplemented.to_orc() + df_not_supported.to_orc() From 989468a4b66637df12337ae0846d858fab06a0d0 Mon Sep 17 00:00:00 2001 From: Ian Alexander Joiner Date: Tue, 7 Jun 2022 12:11:35 -0400 Subject: [PATCH 41/49] Update pandas/core/frame.py Co-authored-by: Matthew Roeschke --- pandas/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 38ee793f283cd..364fda763c718 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2923,7 +2923,7 @@ def to_orc( we refer to objects with a write() method, such as a file handle (e.g. via builtin open function). If path is None, a bytes object is returned. - engine : {{'pyarrow'}}, default 'pyarrow' + engine : str, default 'pyarrow' ORC library to use, or library it self, checked with 'pyarrow' name and version >= 7.0.0. Raises ValueError if it is anything but 'pyarrow'. From a7fca36785c8e7a747f0d4d6e6031706b5e43e58 Mon Sep 17 00:00:00 2001 From: Ian Joiner Date: Sat, 11 Jun 2022 23:58:24 -0400 Subject: [PATCH 42/49] Fix issues according to review --- doc/source/user_guide/io.rst | 5 +++-- doc/source/whatsnew/v1.5.0.rst | 22 ++++++++++++++++++++++ pandas/core/frame.py | 7 +++---- pandas/io/orc.py | 4 ++-- pandas/tests/io/test_orc.py | 8 ++++---- 5 files changed, 34 insertions(+), 12 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 973e978a1453f..4c5d189e1bba3 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -5568,8 +5568,9 @@ ORC format, :func:`~pandas.read_orc` and :func:`~pandas.DataFrame.to_orc`. This .. warning:: * It is *highly recommended* to install pyarrow using conda due to some issues occurred by pyarrow. - * :func:`~pandas.read_orc` is not supported on Windows yet, you can find valid environments on :ref:`install optional dependencies `. - * Unsigned integers, intervals, periods, sparse and categorical Dtypes are not supported yet. + * :func:`~pandas.DataFrame.to_orc` requires pyarrow>=7.0.0. + * :func:`~pandas.read_orc` and :func:`~pandas.DataFrame.to_orc` are not supported on Windows yet, you can find valid environments on :ref:`install optional dependencies `. + * For supported dtypes please refer to `supported ORC features in Arrow `__. * Currently timezones in datetime columns are not preserved when a dataframe is converted into ORC files. .. ipython:: python diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 8a7ad077c2a90..2719d415dedc0 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -100,6 +100,28 @@ as seen in the following example. 1 2021-01-02 08:00:00 4 2 2021-01-02 16:00:00 5 +.. 
_whatsnew_150.enhancements.orc: + +Writing to ORC files +^^^^^^^^^^^^^^^^^^^^ + +The new method :meth:`DataFrame.to_orc` allows writing to ORC files (:issue:`43864`). + +This functionality depends the `pyarrow `__ library. For more details, see :ref:`the IO docs on ORC `. + +.. warning:: + + * It is *highly recommended* to install pyarrow using conda due to some issues occurred by pyarrow. + * :func:`~pandas.DataFrame.to_orc` requires pyarrow>=7.0.0. + * :func:`~pandas.DataFrame.to_orc` is not supported on Windows yet, you can find valid environments on :ref:`install optional dependencies `. + * For supported dtypes please refer to `supported ORC features in Arrow `__. + * Currently timezones in datetime columns are not preserved when a dataframe is converted into ORC files. + +.. code-block:: python + + df = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]}) + df.to_orc("./out.orc") + .. _whatsnew_150.enhancements.tar: Reading directly from TAR archives diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 364fda763c718..6626b7dcad24d 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2925,8 +2925,7 @@ def to_orc( a bytes object is returned. engine : str, default 'pyarrow' ORC library to use, or library it self, checked with 'pyarrow' name - and version >= 7.0.0. Raises ValueError if it is anything but - 'pyarrow'. + and version >= 7.0.0. index : bool, optional If ``True``, include the dataframe's index(es) in the file output. If ``False``, they will not be written to the file. @@ -2964,8 +2963,8 @@ def to_orc( ORC ` and :ref:`install optional dependencies `. * This function requires `pyarrow `_ library. - * Category, unsigned integers, interval, period and sparse Dtypes - are not supported yet. + * For supported dtypes please refer to + `this article `__. * Currently timezones in datetime columns are not preserved when a dataframe is converted into ORC files. diff --git a/pandas/io/orc.py b/pandas/io/orc.py index 655753e22cd05..e679097ec3600 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -119,8 +119,8 @@ def to_orc( :ref:`install optional dependencies `. * This function requires `pyarrow `_ library. - * Category, unsigned integers, interval, period and sparse Dtypes - are not supported yet. + * For supported dtypes please refer to + `this article `__. * Currently timezones in datetime columns are not preserved when a dataframe is converted into ORC files. 
""" diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py index 932918c75dec5..d5a0d4cc4fff0 100644 --- a/pandas/tests/io/test_orc.py +++ b/pandas/tests/io/test_orc.py @@ -267,11 +267,11 @@ def test_orc_roundtrip_file(dirpath): } expected = pd.DataFrame.from_dict(data) - outputfile = os.path.join(dirpath, "TestOrcFile.testReadWrite.orc") - expected.to_orc(outputfile) - got = read_orc(outputfile) + with tm.ensure_clean() as path: + expected.to_orc(path) + got = read_orc(path) - tm.assert_equal(expected, got) + tm.assert_equal(expected, got) @td.skip_if_no("pyarrow", min_version="7.0.0") From 7fc338c6ab8434651cf70b6e4821a0accd832e7c Mon Sep 17 00:00:00 2001 From: Ian Joiner Date: Sun, 12 Jun 2022 01:13:29 -0400 Subject: [PATCH 43/49] Forced reruns --- pandas/core/frame.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 6626b7dcad24d..8d6357cc9ad57 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2970,12 +2970,12 @@ def to_orc( Examples -------- - >>> df = pd.DataFrame(data={'col1': [1, 2], 'col2': [3, 4]}) + >>> df = pd.DataFrame(data={'col1': [1, 2], 'col2': [4, 3]}) >>> df.to_orc('df.orc') # doctest: +SKIP >>> pd.read_orc('df.orc') # doctest: +SKIP col1 col2 - 0 1 3 - 1 2 4 + 0 1 4 + 1 2 3 If you want to get a buffer to the orc content you can write it to io.BytesIO >>> import io From 91d15560330c00a0e6253d8ca76bb6f94f230fa4 Mon Sep 17 00:00:00 2001 From: Ian Joiner Date: Mon, 13 Jun 2022 05:12:33 -0400 Subject: [PATCH 44/49] Fix issues according to review --- pandas/core/frame.py | 15 +++--- pandas/io/orc.py | 44 +++++++++++------- .../io/data/orc/TestOrcFile.testReadWrite.orc | Bin 1344 -> 0 bytes 3 files changed, 33 insertions(+), 26 deletions(-) delete mode 100644 pandas/tests/io/data/orc/TestOrcFile.testReadWrite.orc diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 8d6357cc9ad57..183a45e8dca05 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2908,7 +2908,7 @@ def to_orc( *, engine: Literal["pyarrow"] = "pyarrow", index: bool | None = None, - **kwargs, + engine_kwargs: dict[str, Any] | None = None, ) -> bytes | None: """ Write a DataFrame to the ORC format. @@ -2924,8 +2924,7 @@ def to_orc( (e.g. via builtin open function). If path is None, a bytes object is returned. engine : str, default 'pyarrow' - ORC library to use, or library it self, checked with 'pyarrow' name - and version >= 7.0.0. + ORC library to use. Pyarrow must be >= 7.0.0. index : bool, optional If ``True``, include the dataframe's index(es) in the file output. If ``False``, they will not be written to the file. @@ -2934,8 +2933,8 @@ def to_orc( the RangeIndex will be stored as a range in the metadata so it doesn't require much space and is faster. Other indexes will be included as columns in the file output. - **kwargs - Additional keyword arguments passed to the engine. + engine_kwargs: dict[str, Any], optional + Additional keyword arguments passed to :func:`pyarrow.orc.write_table`. Returns ------- @@ -2963,8 +2962,8 @@ def to_orc( ORC ` and :ref:`install optional dependencies `. * This function requires `pyarrow `_ library. - * For supported dtypes please refer to - `this article `__. + * For supported dtypes please refer to `supported ORC features in Arrow + `__. * Currently timezones in datetime columns are not preserved when a dataframe is converted into ORC files. 
@@ -2986,7 +2985,7 @@ def to_orc( """ from pandas.io.orc import to_orc - return to_orc(self, path, engine=engine, index=index, **kwargs) + return to_orc(self, path, engine=engine, index=index, **engine_kwargs) @Substitution( header_type="bool", diff --git a/pandas/io/orc.py b/pandas/io/orc.py index e679097ec3600..793c0356894fb 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -5,6 +5,7 @@ from types import ModuleType from typing import ( TYPE_CHECKING, + Any, Literal, ) @@ -15,6 +16,14 @@ ) from pandas.compat._optional import import_optional_dependency +from pandas.core.dtypes.common import ( + is_categorical, + is_interval_dtype, + is_period_dtype, + is_sparse, + is_unsigned_integer_dtype, +) + from pandas.io.common import get_handle if TYPE_CHECKING: @@ -66,7 +75,7 @@ def to_orc( *, engine: Literal["pyarrow"] = "pyarrow", index: bool | None = None, - **kwargs, + engine_kwargs: dict[str, Any] | None = None, ) -> bytes | None: """ Write a DataFrame to the ORC format. @@ -85,10 +94,8 @@ def to_orc( we refer to objects with a write() method, such as a file handle (e.g. via builtin open function). If path is None, a bytes object is returned. - engine : {{'pyarrow'}}, default 'pyarrow' - ORC library to use, or library it self, checked with 'pyarrow' name - and version >= 7.0.0. Raises ValueError if it is anything but - 'pyarrow'. + engine : str, default 'pyarrow' + ORC library to use. Pyarrow must be >= 7.0.0. index : bool, optional If ``True``, include the dataframe's index(es) in the file output. If ``False``, they will not be written to the file. @@ -97,8 +104,8 @@ def to_orc( the RangeIndex will be stored as a range in the metadata so it doesn't require much space and is faster. Other indexes will be included as columns in the file output. - **kwargs - Additional keyword arguments passed to the engine. + engine_kwargs: dict[str, Any], optional + Additional keyword arguments passed to :func:`pyarrow.orc.write_table`. Returns ------- @@ -119,8 +126,8 @@ def to_orc( :ref:`install optional dependencies `. * This function requires `pyarrow `_ library. - * For supported dtypes please refer to - `this article `__. + * For supported dtypes please refer to `supported ORC features in Arrow + `__. * Currently timezones in datetime columns are not preserved when a dataframe is converted into ORC files. 
""" @@ -128,18 +135,17 @@ def to_orc( index = df.index.names[0] is not None # If unsupported dtypes are found raise NotImplementedError + # In Pyarrow 9.0.0 this check will no longer be needed for dtype in df.dtypes: - dtype_str = dtype.__str__().lower() if ( - "category" in dtype_str - or "interval" in dtype_str - or "sparse" in dtype_str - or "period" in dtype_str - or "uint" in dtype_str + is_categorical(dtype) + or is_interval_dtype(dtype) + or is_period_dtype(dtype) + or is_sparse(dtype) + or is_unsigned_integer_dtype(dtype) ): raise NotImplementedError( - """The dtype of one or more columns is unsigned integers, -intervals, periods, sparse or categorical which is not supported yet.""" + """The dtype of one or more columns is not supported yet.""" ) if engine != "pyarrow": @@ -154,7 +160,9 @@ def to_orc( with get_handle(path, "wb", is_text=False) as handles: assert isinstance(engine, ModuleType) # For mypy orc.write_table( - engine.Table.from_pandas(df, preserve_index=index), handles.handle, **kwargs + engine.Table.from_pandas(df, preserve_index=index), + handles.handle, + **engine_kwargs, ) if was_none: diff --git a/pandas/tests/io/data/orc/TestOrcFile.testReadWrite.orc b/pandas/tests/io/data/orc/TestOrcFile.testReadWrite.orc deleted file mode 100644 index 852360ecad74cb3545fd160f895fca67d68ef276..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1344 zcma)6&5qJg6uv*b?P*25Q&BHP6C{}oYC<5Im~=B_CQ&y!X5!Ao1TrWjgeC(nW@8$k zz&FrGaI35CH1jkrTvHFXmkRfFi(;;QU=Mge zDVnzfQFzyYg#b+cqUwAes6}uc10PZ;)0#?Km{b=@h)D_36ePY?no10J7vUWYZ^0}# z4=R@w%kBq`?&Kb@sOswu$nX7~f{;$7Akev>w3(UGa%M^&XIL8AE5s`~U=W~JwcZY} zI^K}pSwCchp~5iF@Y%_;89m9fw-ks(YfKp-_{y`7)HJ_neU@j0r2E($eyC`%X|SB- z*;Bl4lyN7|4(PYR5WgsJ!IeZ^)kxJ%jZ9TEausQG)Lo52?P+w?Lyew6P-U%sU57m? 
zhMGvKks2hD8O;>eIX9XsS8K;;M}gMDXrb`F;rU#nU6Xr8drImYv}O|Y`6;5%zHfZr z_`VXjaoFeDk!ff3xLRZVh(CMrI`~P0Vry4+b$vxQHgnrTUcQI_v45*27wp|~+ctL$ zxcnia62e!ey0!1D!Rl|I6#DZ`5v~V*1id5~^}FMCBzkAlemim}gXDDDj-25bk7AUJ z-=q0%lyp&gC&`E2XkIlho}e+Ao({31MjE6=>Hx$tY8kVPTjp3MEORXb%e;cMSgc19 zFGmuujU-+lNxWqw@uee)SMT@szl48EwvQyXBZ)Uh()s-IIqivd^GWj&eBtiuK SNU3p7TkfYnoF5$@R{jEaEZds^ From a28c5a8786697f990612efaae8beb46e00871944 Mon Sep 17 00:00:00 2001 From: Ian Joiner Date: Mon, 13 Jun 2022 05:26:51 -0400 Subject: [PATCH 45/49] Reraise Pyarrow TypeError as NotImplementedError --- pandas/io/orc.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/pandas/io/orc.py b/pandas/io/orc.py index 793c0356894fb..078b9c7a9af84 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -20,7 +20,6 @@ is_categorical, is_interval_dtype, is_period_dtype, - is_sparse, is_unsigned_integer_dtype, ) @@ -141,7 +140,6 @@ def to_orc( is_categorical(dtype) or is_interval_dtype(dtype) or is_period_dtype(dtype) - or is_sparse(dtype) or is_unsigned_integer_dtype(dtype) ): raise NotImplementedError( @@ -159,11 +157,16 @@ def to_orc( assert path is not None # For mypy with get_handle(path, "wb", is_text=False) as handles: assert isinstance(engine, ModuleType) # For mypy - orc.write_table( - engine.Table.from_pandas(df, preserve_index=index), - handles.handle, - **engine_kwargs, - ) + try: + orc.write_table( + engine.Table.from_pandas(df, preserve_index=index), + handles.handle, + **engine_kwargs, + ) + except TypeError as e: + raise NotImplementedError( + """The dtype of one or more columns is not supported yet.""" + ) from e if was_none: assert isinstance(path, io.BytesIO) # For mypy From 162e5bb36461b47248b4a8d47555e05994658b29 Mon Sep 17 00:00:00 2001 From: Ian Joiner Date: Mon, 13 Jun 2022 06:02:05 -0400 Subject: [PATCH 46/49] Fix bugs --- pandas/core/frame.py | 4 ++-- pandas/io/orc.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 183a45e8dca05..fd853d6603a2c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2908,7 +2908,7 @@ def to_orc( *, engine: Literal["pyarrow"] = "pyarrow", index: bool | None = None, - engine_kwargs: dict[str, Any] | None = None, + engine_kwargs: dict[str, Any] = {}, ) -> bytes | None: """ Write a DataFrame to the ORC format. @@ -2933,7 +2933,7 @@ def to_orc( the RangeIndex will be stored as a range in the metadata so it doesn't require much space and is faster. Other indexes will be included as columns in the file output. - engine_kwargs: dict[str, Any], optional + engine_kwargs : dict[str, Any], default {} Additional keyword arguments passed to :func:`pyarrow.orc.write_table`. Returns diff --git a/pandas/io/orc.py b/pandas/io/orc.py index 078b9c7a9af84..b80c6635776c9 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -74,7 +74,7 @@ def to_orc( *, engine: Literal["pyarrow"] = "pyarrow", index: bool | None = None, - engine_kwargs: dict[str, Any] | None = None, + engine_kwargs: dict[str, Any] = {}, ) -> bytes | None: """ Write a DataFrame to the ORC format. @@ -103,7 +103,7 @@ def to_orc( the RangeIndex will be stored as a range in the metadata so it doesn't require much space and is faster. Other indexes will be included as columns in the file output. - engine_kwargs: dict[str, Any], optional + engine_kwargs : dict[str, Any], default {} Additional keyword arguments passed to :func:`pyarrow.orc.write_table`. 
Returns From b23058350fe42a239c5d58cd47fa92069be82c03 Mon Sep 17 00:00:00 2001 From: Ian Joiner Date: Mon, 13 Jun 2022 06:12:57 -0400 Subject: [PATCH 47/49] Fix expected error msg in orc tests --- pandas/io/orc.py | 4 ++-- pandas/tests/io/test_orc.py | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/pandas/io/orc.py b/pandas/io/orc.py index b80c6635776c9..36bc72fa4c936 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -143,7 +143,7 @@ def to_orc( or is_unsigned_integer_dtype(dtype) ): raise NotImplementedError( - """The dtype of one or more columns is not supported yet.""" + "The dtype of one or more columns is not supported yet." ) if engine != "pyarrow": @@ -165,7 +165,7 @@ def to_orc( ) except TypeError as e: raise NotImplementedError( - """The dtype of one or more columns is not supported yet.""" + "The dtype of one or more columns is not supported yet." ) from e if was_none: diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py index d5a0d4cc4fff0..0bb320907b813 100644 --- a/pandas/tests/io/test_orc.py +++ b/pandas/tests/io/test_orc.py @@ -302,7 +302,6 @@ def test_orc_roundtrip_bytesio(): def test_orc_writer_dtypes_not_supported(df_not_supported): # GH44554 # PyArrow gained ORC write support with the current argument order - msg = """The dtype of one or more columns is unsigned integers, -intervals, periods, sparse or categorical which is not supported yet.""" + msg = "The dtype of one or more columns is not supported yet." with pytest.raises(NotImplementedError, match=msg): df_not_supported.to_orc() From e16edabf733255f23991017be58e98db036f4204 Mon Sep 17 00:00:00 2001 From: Ian Joiner Date: Mon, 13 Jun 2022 06:22:13 -0400 Subject: [PATCH 48/49] Avoid deprecated functions --- pandas/io/orc.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/orc.py b/pandas/io/orc.py index 36bc72fa4c936..5e1d3f7c86b23 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -17,7 +17,7 @@ from pandas.compat._optional import import_optional_dependency from pandas.core.dtypes.common import ( - is_categorical, + is_categorical_dtype, is_interval_dtype, is_period_dtype, is_unsigned_integer_dtype, @@ -137,7 +137,7 @@ def to_orc( # In Pyarrow 9.0.0 this check will no longer be needed for dtype in df.dtypes: if ( - is_categorical(dtype) + is_categorical_dtype(dtype) or is_interval_dtype(dtype) or is_period_dtype(dtype) or is_unsigned_integer_dtype(dtype) From e4770b8cd6c49cf88c63931adbec40a747a55b84 Mon Sep 17 00:00:00 2001 From: Ian Joiner Date: Mon, 13 Jun 2022 17:34:47 -0400 Subject: [PATCH 49/49] Replace {} with None in arg --- pandas/core/frame.py | 8 +++++--- pandas/io/orc.py | 6 ++++-- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index fd853d6603a2c..00cfd0e0f8fd7 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2908,7 +2908,7 @@ def to_orc( *, engine: Literal["pyarrow"] = "pyarrow", index: bool | None = None, - engine_kwargs: dict[str, Any] = {}, + engine_kwargs: dict[str, Any] | None = None, ) -> bytes | None: """ Write a DataFrame to the ORC format. @@ -2933,7 +2933,7 @@ def to_orc( the RangeIndex will be stored as a range in the metadata so it doesn't require much space and is faster. Other indexes will be included as columns in the file output. - engine_kwargs : dict[str, Any], default {} + engine_kwargs : dict[str, Any] or None, default None Additional keyword arguments passed to :func:`pyarrow.orc.write_table`. 
Returns @@ -2985,7 +2985,9 @@ def to_orc( """ from pandas.io.orc import to_orc - return to_orc(self, path, engine=engine, index=index, **engine_kwargs) + return to_orc( + self, path, engine=engine, index=index, engine_kwargs=engine_kwargs + ) @Substitution( header_type="bool", diff --git a/pandas/io/orc.py b/pandas/io/orc.py index 5e1d3f7c86b23..40754a56bbe8b 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -74,7 +74,7 @@ def to_orc( *, engine: Literal["pyarrow"] = "pyarrow", index: bool | None = None, - engine_kwargs: dict[str, Any] = {}, + engine_kwargs: dict[str, Any] | None = None, ) -> bytes | None: """ Write a DataFrame to the ORC format. @@ -103,7 +103,7 @@ def to_orc( the RangeIndex will be stored as a range in the metadata so it doesn't require much space and is faster. Other indexes will be included as columns in the file output. - engine_kwargs : dict[str, Any], default {} + engine_kwargs : dict[str, Any] or None, default None Additional keyword arguments passed to :func:`pyarrow.orc.write_table`. Returns @@ -132,6 +132,8 @@ def to_orc( """ if index is None: index = df.index.names[0] is not None + if engine_kwargs is None: + engine_kwargs = {} # If unsupported dtypes are found raise NotImplementedError # In Pyarrow 9.0.0 this check will no longer be needed