[EHN] pandas.DataFrame.to_orc #44554
Changes from 39 commits
@@ -159,6 +159,7 @@ ORC
    :toctree: api/

     read_orc
+    DataFrame.to_orc

 SAS
 ~~~
@@ -2858,6 +2858,7 @@ def to_parquet(
         See Also
         --------
         read_parquet : Read a parquet file.
+        DataFrame.to_orc : Write an orc file.
         DataFrame.to_csv : Write a csv file.
         DataFrame.to_sql : Write to a sql table.
         DataFrame.to_hdf : Write to hdf.

@@ -2901,6 +2902,93 @@ def to_parquet(
             **kwargs,
         )

+    def to_orc(
+        self,
+        path: FilePath | WriteBuffer[bytes] | None = None,
+        *,
+        engine: Literal["pyarrow"] = "pyarrow",
+        index: bool | None = None,
+        **kwargs,
+    ) -> bytes | None:
+        """
+        Write a DataFrame to the ORC format.
+
+        .. versionadded:: 1.5.0
+
+        Parameters
+        ----------
+        path : str, file-like object or None, default None
+            If a string, it will be used as Root Directory path
+            when writing a partitioned dataset. By file-like object,
+            we refer to objects with a write() method, such as a file handle
+            (e.g. via builtin open function). If path is None,
+            a bytes object is returned.
+        engine : {'pyarrow'}, default 'pyarrow'
+            ORC library to use, or the library itself; checked against the
+            'pyarrow' name, with version >= 7.0.0 required. Raises
+            ValueError if it is anything but 'pyarrow'.
+        index : bool, optional
+            If ``True``, include the dataframe's index(es) in the file output.
+            If ``False``, they will not be written to the file.
+            If ``None``, similar to ``infer``, the dataframe's index(es)
+            will be saved. However, instead of being saved as values,
+            a RangeIndex will be stored as a range in the metadata so it
+            doesn't require much space and is faster. Other indexes will
+            be included as columns in the file output.
+        **kwargs
+            Additional keyword arguments passed to the engine.
+
+        Returns
+        -------
+        bytes if no path argument is provided else None
+
+        Raises
+        ------
+        NotImplementedError
+            Dtype of one or more columns is category, unsigned integers,
+            interval, period or sparse.
+        ValueError
+            engine is not pyarrow.
+
+        See Also
+        --------
+        read_orc : Read an ORC file.
+        DataFrame.to_parquet : Write a parquet file.
+        DataFrame.to_csv : Write a csv file.
+        DataFrame.to_sql : Write to a sql table.
+        DataFrame.to_hdf : Write to hdf.
+
+        Notes
+        -----
+        * Before using this function you should read the :ref:`user guide about
+          ORC <io.orc>` and :ref:`install optional dependencies <install.warn_orc>`.
+        * This function requires the `pyarrow <https://arrow.apache.org/docs/python/>`_
+          library.
+        * Category, unsigned integers, interval, period and sparse Dtypes
+          are not supported yet.
+        * Currently timezones in datetime columns are not preserved when a
+          dataframe is converted into ORC files.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(data={'col1': [1, 2], 'col2': [3, 4]})
+        >>> df.to_orc('df.orc')  # doctest: +SKIP
+        >>> pd.read_orc('df.orc')  # doctest: +SKIP
+           col1  col2
+        0     1     3
+        1     2     4
+
+        If you want to get a buffer to the orc content you can write it to io.BytesIO:
+
+        >>> import io
+        >>> b = io.BytesIO(df.to_orc())  # doctest: +SKIP
+        >>> b.seek(0)  # doctest: +SKIP
+        0
+        >>> content = b.read()  # doctest: +SKIP
+        """
+        from pandas.io.orc import to_orc
+
+        return to_orc(self, path, engine=engine, index=index, **kwargs)
+
     @Substitution(
         header_type="bool",
         header="Whether to print column labels, default True",

Inline review (on ``**kwargs``):

mroeschke: Could you name this ...? Also, is there documentation you can link from pyarrow on what other engine keyword arguments can be accepted?

iajoiner: @mroeschke You mean just like the excel methods but without having to support the legacy ... I've followed ...
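The path-or-bytes convention the docstring describes (return ``bytes`` when ``path`` is None, otherwise write out and return None) can be sketched on its own. The helper below is a hypothetical stand-in for illustration, not pandas code; the ``data`` argument stands in for already-serialized ORC bytes:

```python
import io


def write_or_return_bytes(data: bytes, path=None):
    # Mirrors to_orc's convention:
    #   no path            -> buffer in memory and return the bytes
    #   a str path         -> write a file and return None
    #   a file-like object -> write to it and return None
    was_none = path is None
    if was_none:
        path = io.BytesIO()
    if isinstance(path, str):
        with open(path, "wb") as fh:
            fh.write(data)
    else:
        path.write(data)
    if was_none:
        return path.getvalue()
    return None


print(write_or_return_bytes(b"orc-bytes"))  # b'orc-bytes'
```

Passing your own ``io.BytesIO`` returns None and leaves the bytes in the buffer, which matches the ``b.seek(0)`` round-trip shown in the docstring example.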
@@ -1,11 +1,17 @@
 """ orc compat """
 from __future__ import annotations

-from typing import TYPE_CHECKING
+import io
+from types import ModuleType
+from typing import (
+    TYPE_CHECKING,
+    Literal,
+)

 from pandas._typing import (
     FilePath,
     ReadBuffer,
+    WriteBuffer,
 )
 from pandas.compat._optional import import_optional_dependency

@@ -52,3 +58,106 @@ def read_orc(
     with get_handle(path, "rb", is_text=False) as handles:
         orc_file = orc.ORCFile(handles.handle)
         return orc_file.read(columns=columns, **kwargs).to_pandas()
+
+
+def to_orc(
+    df: DataFrame,
+    path: FilePath | WriteBuffer[bytes] | None = None,
+    *,
+    engine: Literal["pyarrow"] = "pyarrow",
+    index: bool | None = None,
+    **kwargs,
+) -> bytes | None:
+    """
+    Write a DataFrame to the ORC format.
+
+    .. versionadded:: 1.5.0
+
+    Parameters
+    ----------
+    df : DataFrame
+        The dataframe to be written to ORC. Raises NotImplementedError
+        if the dtype of one or more columns is category, unsigned integers,
+        intervals, periods or sparse.
+    path : str, file-like object or None, default None
+        If a string, it will be used as Root Directory path
+        when writing a partitioned dataset. By file-like object,
+        we refer to objects with a write() method, such as a file handle
+        (e.g. via builtin open function). If path is None,
+        a bytes object is returned.
+    engine : {'pyarrow'}, default 'pyarrow'
+        ORC library to use, or the library itself; checked against the
+        'pyarrow' name, with version >= 7.0.0 required. Raises ValueError
+        if it is anything but 'pyarrow'.
+    index : bool, optional
+        If ``True``, include the dataframe's index(es) in the file output.
+        If ``False``, they will not be written to the file.
+        If ``None``, similar to ``infer``, the dataframe's index(es)
+        will be saved. However, instead of being saved as values,
+        a RangeIndex will be stored as a range in the metadata so it
+        doesn't require much space and is faster. Other indexes will
+        be included as columns in the file output.
+    **kwargs
+        Additional keyword arguments passed to the engine.
+
+    Returns
+    -------
+    bytes if no path argument is provided else None
+
+    Raises
+    ------
+    NotImplementedError
+        Dtype of one or more columns is category, unsigned integers,
+        interval, period or sparse.
+    ValueError
+        engine is not pyarrow.
+
+    Notes
+    -----
+    * Before using this function you should read the
+      :ref:`user guide about ORC <io.orc>` and
+      :ref:`install optional dependencies <install.warn_orc>`.
+    * This function requires the `pyarrow <https://arrow.apache.org/docs/python/>`_
+      library.
+    * Category, unsigned integers, interval, period and sparse Dtypes
+      are not supported yet.
+    * Currently timezones in datetime columns are not preserved when a
+      dataframe is converted into ORC files.
+    """
+    if index is None:
+        index = df.index.names[0] is not None
+
+    # If unsupported dtypes are found raise NotImplementedError
+    for dtype in df.dtypes:
+        dtype_str = dtype.__str__().lower()
+        if (
+            "category" in dtype_str
+            or "interval" in dtype_str
+            or "sparse" in dtype_str
+            or "period" in dtype_str
+            or "uint" in dtype_str
+        ):
+            raise NotImplementedError(
+                "The dtype of one or more columns is unsigned integers, "
+                "intervals, periods, sparse or categorical which is not "
+                "supported yet."
+            )
+
+    if engine != "pyarrow":
+        raise ValueError("engine must be 'pyarrow'")
+    engine = import_optional_dependency(engine, min_version="7.0.0")
+    orc = import_optional_dependency("pyarrow.orc")
+
+    was_none = path is None
+    if was_none:
+        path = io.BytesIO()
+    assert path is not None  # For mypy
+    with get_handle(path, "wb", is_text=False) as handles:
+        assert isinstance(engine, ModuleType)  # For mypy
+        orc.write_table(
+            engine.Table.from_pandas(df, preserve_index=index), handles.handle, **kwargs
+        )
+
+    if was_none:
+        assert isinstance(path, io.BytesIO)  # For mypy
+        return path.getvalue()
+    return None

Inline review (on the dtype check):

mroeschke: Will pyarrow raise if these dtypes are passed? If so, can a pyarrow error be caught and reraised as a ``NotImplementedError``?

iajoiner: I need to test these types individually. Not sure right now. ... @mroeschke It seg faults out for all instances but sparse. I need to catch them in Arrow 9.0.0. Meanwhile can we use the current dtype filter?

mroeschke: Okay, this is fine then.
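The dtype gate in ``to_orc`` above can be exercised in isolation. This sketch reimplements the same substring check outside pandas' internals (the helper name is ours, not from the PR):

```python
import pandas as pd

# Dtype families the diff's check rejects before handing off to pyarrow.
_UNSUPPORTED = ("category", "interval", "sparse", "period", "uint")


def has_unsupported_orc_dtype(df: pd.DataFrame) -> bool:
    # Same substring test as in to_orc: lowercase each column dtype's
    # string form and look for an unsupported family name.
    return any(
        any(marker in str(dtype).lower() for marker in _UNSUPPORTED)
        for dtype in df.dtypes
    )


ok = pd.DataFrame({"a": [1, 2], "b": [0.5, 1.5]})
bad = pd.DataFrame({"c": pd.Categorical(["x", "y"])})
print(has_unsupported_orc_dtype(ok))   # False
print(has_unsupported_orc_dtype(bad))  # True
```

Note that the substring match is deliberately broad: ``"uint"`` catches ``uint8`` through ``uint64``, and ``"sparse"`` catches any ``Sparse[...]`` dtype.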
Review thread (on the Notes section listing unsupported dtypes):

mroeschke: Is this documented in pyarrow? It would be great to reference the pyarrow documentation instead of listing like this, since these types of notes tend to get stale as libraries advance.

iajoiner: @mroeschke The Arrow version of the type restrictions is documented here: https://arrow.apache.org/docs/cpp/orc.html
Using the usual pyarrow-to-pandas correspondence, people may be able to deduce what dtypes are not allowed. Since it is not straightforward, maybe we should keep both the pandas doc and a link to the Arrow one?

mroeschke: I would still slightly prefer to link to https://arrow.apache.org/docs/cpp/orc.html#data-types

iajoiner: Sure! ... Done!
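One more detail worth calling out from the ``to_orc`` implementation: when ``index=None``, the index is preserved only if it is named (``df.index.names[0] is not None``). That inference rule can be checked standalone:

```python
import pandas as pd


def infer_preserve_index(df: pd.DataFrame) -> bool:
    # The diff's rule for index=None: keep the index in the ORC output
    # only when it carries a name; a default unnamed RangeIndex is not
    # preserved as a column.
    return df.index.names[0] is not None


unnamed = pd.DataFrame({"a": [1, 2]})
named = unnamed.rename_axis("row_id")
print(infer_preserve_index(unnamed))  # False
print(infer_preserve_index(named))    # True
```

The result is then passed straight through as ``preserve_index`` to ``pyarrow.Table.from_pandas``, so callers who want an unnamed index written out must pass ``index=True`` explicitly.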