diff --git a/doc/source/reference/frame.rst b/doc/source/reference/frame.rst
index ea27d1efbb235..e71ee80767d29 100644
--- a/doc/source/reference/frame.rst
+++ b/doc/source/reference/frame.rst
@@ -373,6 +373,7 @@ Serialization / IO / conversion

    DataFrame.from_dict
    DataFrame.from_records
+   DataFrame.to_orc
    DataFrame.to_parquet
    DataFrame.to_pickle
    DataFrame.to_csv
diff --git a/doc/source/reference/io.rst b/doc/source/reference/io.rst
index 70fd381bffd2c..425b5f81be966 100644
--- a/doc/source/reference/io.rst
+++ b/doc/source/reference/io.rst
@@ -159,6 +159,7 @@ ORC
    :toctree: api/

    read_orc
+   DataFrame.to_orc

 SAS
 ~~~
diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
index 4e19deb84487f..4c5d189e1bba3 100644
--- a/doc/source/user_guide/io.rst
+++ b/doc/source/user_guide/io.rst
@@ -30,7 +30,7 @@ The pandas I/O API is a set of top level ``reader`` functions accessed like
     binary;`HDF5 Format <https://support.hdfgroup.org/HDF5/whatis/index.html>`__;:ref:`read_hdf<io.hdf5>`;:ref:`to_hdf<io.hdf5>`
     binary;`Feather Format <https://github.com/wesm/feather>`__;:ref:`read_feather<io.feather>`;:ref:`to_feather<io.feather>`
     binary;`Parquet Format <https://parquet.apache.org/>`__;:ref:`read_parquet<io.parquet>`;:ref:`to_parquet<io.parquet>`
-    binary;`ORC Format <https://orc.apache.org/>`__;:ref:`read_orc<io.orc>`;
+    binary;`ORC Format <https://orc.apache.org/>`__;:ref:`read_orc<io.orc>`;:ref:`to_orc<io.orc>`
     binary;`Stata <https://en.wikipedia.org/wiki/Stata>`__;:ref:`read_stata<io.stata_reader>`;:ref:`to_stata<io.stata_writer>`
     binary;`SAS <https://en.wikipedia.org/wiki/SAS_(software)>`__;:ref:`read_sas<io.sas_reader>`;
     binary;`SPSS <https://en.wikipedia.org/wiki/SPSS>`__;:ref:`read_spss<io.spss_reader>`;
@@ -5562,13 +5562,64 @@ ORC

 .. versionadded:: 1.0.0

 Similar to the :ref:`parquet <io.parquet>` format, the `ORC Format <https://orc.apache.org/>`__ is a binary columnar serialization
-for data frames. It is designed to make reading data frames efficient. pandas provides *only* a reader for the
-ORC format, :func:`~pandas.read_orc`. This requires the `pyarrow <https://arrow.apache.org/docs/python/>`__ library.
+for data frames. It is designed to make reading data frames efficient. pandas provides both a reader and a writer for the
+ORC format, :func:`~pandas.read_orc` and :func:`~pandas.DataFrame.to_orc`. This requires the `pyarrow <https://arrow.apache.org/docs/python/>`__ library.

 .. warning::

    * It is *highly recommended* to install pyarrow using conda due to some issues occurred by pyarrow.
-   * :func:`~pandas.read_orc` is not supported on Windows yet, you can find valid environments on :ref:`install optional dependencies <install.optional_dependencies>`.
+   * :func:`~pandas.DataFrame.to_orc` requires pyarrow>=7.0.0.
+   * :func:`~pandas.read_orc` and :func:`~pandas.DataFrame.to_orc` are not supported on Windows yet; you can find valid environments on :ref:`install optional dependencies <install.optional_dependencies>`.
+   * For supported dtypes please refer to `supported ORC features in Arrow <https://arrow.apache.org/docs/cpp/orc.html#data-types>`__.
+   * Currently timezones in datetime columns are not preserved when a dataframe is converted into ORC files.
+
+.. ipython:: python
+
+   df = pd.DataFrame(
+       {
+           "a": list("abc"),
+           "b": list(range(1, 4)),
+           "c": np.arange(4.0, 7.0, dtype="float64"),
+           "d": [True, False, True],
+           "e": pd.date_range("20130101", periods=3),
+       }
+   )
+
+   df
+   df.dtypes
+
+Write to an ORC file.
+
+.. ipython:: python
+   :okwarning:
+
+   df.to_orc("example_pa.orc", engine="pyarrow")
+
+Read from an ORC file.
+
+.. ipython:: python
+   :okwarning:
+
+   result = pd.read_orc("example_pa.orc")
+
+   result.dtypes
+
+Read only certain columns of an ORC file.
+
+.. ipython:: python
+
+   result = pd.read_orc(
+       "example_pa.orc",
+       columns=["a", "b"],
+   )
+   result.dtypes
+
+
+.. ipython:: python
+   :suppress:
+
+   os.remove("example_pa.orc")
+
 .. _io.sql:
diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
index 8a7ad077c2a90..2719d415dedc0 100644
--- a/doc/source/whatsnew/v1.5.0.rst
+++ b/doc/source/whatsnew/v1.5.0.rst
@@ -100,6 +100,28 @@ as seen in the following example.
    1 2021-01-02 08:00:00      4
    2 2021-01-02 16:00:00      5

+.. _whatsnew_150.enhancements.orc:
+
+Writing to ORC files
+^^^^^^^^^^^^^^^^^^^^
+
+The new method :meth:`DataFrame.to_orc` allows writing to ORC files (:issue:`43864`).
+
+This functionality depends on the `pyarrow <https://arrow.apache.org/docs/python/>`__ library. For more details, see :ref:`the IO docs on ORC <io.orc>`.
+
+.. warning::
+
+   * It is *highly recommended* to install pyarrow using conda due to some issues caused by pyarrow.
+   * :func:`~pandas.DataFrame.to_orc` requires pyarrow>=7.0.0.
+   * :func:`~pandas.DataFrame.to_orc` is not supported on Windows yet; you can find valid environments on :ref:`install optional dependencies <install.optional_dependencies>`.
+   * For supported dtypes please refer to `supported ORC features in Arrow <https://arrow.apache.org/docs/cpp/orc.html#data-types>`__.
+   * Currently timezones in datetime columns are not preserved when a dataframe is converted into ORC files.
+
+.. code-block:: python
+
+    df = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]})
+    df.to_orc("./out.orc")
+
 .. _whatsnew_150.enhancements.tar:

 Reading directly from TAR archives
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 54ee5ed2f35d1..00cfd0e0f8fd7 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -2858,6 +2858,7 @@ def to_parquet(
         See Also
         --------
         read_parquet : Read a parquet file.
+        DataFrame.to_orc : Write an orc file.
         DataFrame.to_csv : Write a csv file.
         DataFrame.to_sql : Write to a sql table.
         DataFrame.to_hdf : Write to hdf.
@@ -2901,6 +2902,93 @@ def to_parquet(
             **kwargs,
         )

+    def to_orc(
+        self,
+        path: FilePath | WriteBuffer[bytes] | None = None,
+        *,
+        engine: Literal["pyarrow"] = "pyarrow",
+        index: bool | None = None,
+        engine_kwargs: dict[str, Any] | None = None,
+    ) -> bytes | None:
+        """
+        Write a DataFrame to the ORC format.
+
+        .. versionadded:: 1.5.0
+
+        Parameters
+        ----------
+        path : str, file-like object or None, default None
+            If a string, it will be used as Root Directory path
+            when writing a partitioned dataset. By file-like object,
+            we refer to objects with a write() method, such as a file handle
+            (e.g. via builtin open function). If path is None,
+            a bytes object is returned.
+        engine : str, default 'pyarrow'
+            ORC library to use. Pyarrow must be >= 7.0.0.
+        index : bool, optional
+            If ``True``, include the dataframe's index(es) in the file output.
+            If ``False``, they will not be written to the file.
+            If ``None``, similar to ``infer``, the dataframe's index(es)
+            will be saved. However, instead of being saved as values,
+            the RangeIndex will be stored as a range in the metadata so it
+            doesn't require much space and is faster. Other indexes will
+            be included as columns in the file output.
+        engine_kwargs : dict[str, Any] or None, default None
+            Additional keyword arguments passed to :func:`pyarrow.orc.write_table`.
+
+        Returns
+        -------
+        bytes if no path argument is provided else None
+
+        Raises
+        ------
+        NotImplementedError
+            Dtype of one or more columns is category, unsigned integers, interval,
+            period or sparse.
+        ValueError
+            engine is not pyarrow.
+
+        See Also
+        --------
+        read_orc : Read an ORC file.
+        DataFrame.to_parquet : Write a parquet file.
+        DataFrame.to_csv : Write a csv file.
+        DataFrame.to_sql : Write to a sql table.
+        DataFrame.to_hdf : Write to hdf.
+
+        Notes
+        -----
+        * Before using this function you should read the :ref:`user guide about
+          ORC <io.orc>` and :ref:`install optional dependencies <install.optional_dependencies>`.
+        * This function requires the `pyarrow <https://arrow.apache.org/docs/python/>`_
+          library.
+        * For supported dtypes please refer to `supported ORC features in Arrow
+          <https://arrow.apache.org/docs/cpp/orc.html#data-types>`__.
+        * Currently timezones in datetime columns are not preserved when a
+          dataframe is converted into ORC files.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(data={'col1': [1, 2], 'col2': [4, 3]})
+        >>> df.to_orc('df.orc')  # doctest: +SKIP
+        >>> pd.read_orc('df.orc')  # doctest: +SKIP
+           col1  col2
+        0     1     4
+        1     2     3
+
+        If you want to get a buffer to the ORC content you can write it to io.BytesIO:
+
+        >>> import io
+        >>> b = io.BytesIO(df.to_orc())  # doctest: +SKIP
+        >>> b.seek(0)  # doctest: +SKIP
+        0
+        >>> content = b.read()  # doctest: +SKIP
+        """
+        from pandas.io.orc import to_orc
+
+        return to_orc(
+            self, path, engine=engine, index=index, engine_kwargs=engine_kwargs
+        )
+
     @Substitution(
         header_type="bool",
         header="Whether to print column labels, default True",
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 89a590f291356..78edaf15fe7ce 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -2629,6 +2629,7 @@ def to_hdf(
         See Also
         --------
         read_hdf : Read from HDF file.
+        DataFrame.to_orc : Write a DataFrame to the binary orc format.
         DataFrame.to_parquet : Write a DataFrame to the binary parquet format.
         DataFrame.to_sql : Write to a SQL table.
         DataFrame.to_feather : Write out feather-format for DataFrames.
diff --git a/pandas/io/orc.py b/pandas/io/orc.py
index b02660c089382..40754a56bbe8b 100644
--- a/pandas/io/orc.py
+++ b/pandas/io/orc.py
@@ -1,14 +1,28 @@
 """ orc compat """
 from __future__ import annotations

-from typing import TYPE_CHECKING
+import io
+from types import ModuleType
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Literal,
+)

 from pandas._typing import (
     FilePath,
     ReadBuffer,
+    WriteBuffer,
 )
 from pandas.compat._optional import import_optional_dependency

+from pandas.core.dtypes.common import (
+    is_categorical_dtype,
+    is_interval_dtype,
+    is_period_dtype,
+    is_unsigned_integer_dtype,
+)
+
 from pandas.io.common import get_handle

 if TYPE_CHECKING:
@@ -52,3 +66,111 @@ def read_orc(
     with get_handle(path, "rb", is_text=False) as handles:
         orc_file = orc.ORCFile(handles.handle)
         return orc_file.read(columns=columns, **kwargs).to_pandas()
+
+
+def to_orc(
+    df: DataFrame,
+    path: FilePath | WriteBuffer[bytes] | None = None,
+    *,
+    engine: Literal["pyarrow"] = "pyarrow",
+    index: bool | None = None,
+    engine_kwargs: dict[str, Any] | None = None,
+) -> bytes | None:
+    """
+    Write a DataFrame to the ORC format.
+
+    .. versionadded:: 1.5.0
+
+    Parameters
+    ----------
+    df : DataFrame
+        The dataframe to be written to ORC. Raises NotImplementedError
+        if dtype of one or more columns is category, unsigned integers,
+        intervals, periods or sparse.
+    path : str, file-like object or None, default None
+        If a string, it will be used as Root Directory path
+        when writing a partitioned dataset. By file-like object,
+        we refer to objects with a write() method, such as a file handle
+        (e.g. via builtin open function). If path is None,
+        a bytes object is returned.
+    engine : str, default 'pyarrow'
+        ORC library to use. Pyarrow must be >= 7.0.0.
+    index : bool, optional
+        If ``True``, include the dataframe's index(es) in the file output. If
+        ``False``, they will not be written to the file.
+        If ``None``, similar to ``infer``, the dataframe's index(es)
+        will be saved. However, instead of being saved as values,
+        the RangeIndex will be stored as a range in the metadata so it
+        doesn't require much space and is faster. Other indexes will
+        be included as columns in the file output.
+    engine_kwargs : dict[str, Any] or None, default None
+        Additional keyword arguments passed to :func:`pyarrow.orc.write_table`.
+
+    Returns
+    -------
+    bytes if no path argument is provided else None
+
+    Raises
+    ------
+    NotImplementedError
+        Dtype of one or more columns is category, unsigned integers, interval,
+        period or sparse.
+    ValueError
+        engine is not pyarrow.
+
+    Notes
+    -----
+    * Before using this function you should read the
+      :ref:`user guide about ORC <io.orc>` and
+      :ref:`install optional dependencies <install.optional_dependencies>`.
+    * This function requires the `pyarrow <https://arrow.apache.org/docs/python/>`_
+      library.
+    * For supported dtypes please refer to `supported ORC features in Arrow
+      <https://arrow.apache.org/docs/cpp/orc.html#data-types>`__.
+    * Currently timezones in datetime columns are not preserved when a
+      dataframe is converted into ORC files.
+    """
+    if index is None:
+        index = df.index.names[0] is not None
+    if engine_kwargs is None:
+        engine_kwargs = {}
+
+    # If unsupported dtypes are found raise NotImplementedError
+    # In Pyarrow 9.0.0 this check will no longer be needed
+    for dtype in df.dtypes:
+        if (
+            is_categorical_dtype(dtype)
+            or is_interval_dtype(dtype)
+            or is_period_dtype(dtype)
+            or is_unsigned_integer_dtype(dtype)
+        ):
+            raise NotImplementedError(
+                "The dtype of one or more columns is not supported yet."
+            )
+
+    if engine != "pyarrow":
+        raise ValueError("engine must be 'pyarrow'")
+    engine = import_optional_dependency(engine, min_version="7.0.0")
+    orc = import_optional_dependency("pyarrow.orc")
+
+    was_none = path is None
+    if was_none:
+        path = io.BytesIO()
+    assert path is not None  # For mypy
+    with get_handle(path, "wb", is_text=False) as handles:
+        assert isinstance(engine, ModuleType)  # For mypy
+        try:
+            orc.write_table(
+                engine.Table.from_pandas(df, preserve_index=index),
+                handles.handle,
+                **engine_kwargs,
+            )
+        except TypeError as e:
+            raise NotImplementedError(
+                "The dtype of one or more columns is not supported yet."
+            ) from e
+
+    if was_none:
+        assert isinstance(path, io.BytesIO)  # For mypy
+        return path.getvalue()
+    return None
diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py
index f34e9b940317d..0bb320907b813 100644
--- a/pandas/tests/io/test_orc.py
+++ b/pandas/tests/io/test_orc.py
@@ -1,10 +1,13 @@
 """ test orc compat """
 import datetime
+from io import BytesIO
 import os

 import numpy as np
 import pytest

+import pandas.util._test_decorators as td
+
 import pandas as pd
 from pandas import read_orc
 import pandas._testing as tm
@@ -21,6 +24,27 @@ def dirpath(datapath):
     return datapath("io", "data", "orc")


+# Examples of dataframes with dtypes for which conversion to ORC
+# hasn't been implemented yet, that is, category, unsigned integers,
+# interval, period and sparse.
+orc_writer_dtypes_not_supported = [
+    pd.DataFrame({"unimpl": np.array([1, 20], dtype="uint64")}),
+    pd.DataFrame({"unimpl": pd.Series(["a", "b", "a"], dtype="category")}),
+    pd.DataFrame(
+        {"unimpl": [pd.Interval(left=0, right=2), pd.Interval(left=0, right=5)]}
+    ),
+    pd.DataFrame(
+        {
+            "unimpl": [
+                pd.Period("2022-01-03", freq="D"),
+                pd.Period("2022-01-04", freq="D"),
+            ]
+        }
+    ),
+    pd.DataFrame({"unimpl": [np.nan] * 50}).astype(pd.SparseDtype("float", np.nan)),
+]
+
+
 def test_orc_reader_empty(dirpath):
     columns = [
         "boolean1",
@@ -224,3 +248,60 @@ def test_orc_reader_snappy_compressed(dirpath):
     got = read_orc(inputfile).iloc[:10]

     tm.assert_equal(expected, got)
+
+
+@td.skip_if_no("pyarrow", min_version="7.0.0")
+def test_orc_roundtrip_file(dirpath):
+    # GH44554
+    # PyArrow gained ORC write support with the current argument order
+    data = {
+        "boolean1": np.array([False, True], dtype="bool"),
+        "byte1": np.array([1, 100], dtype="int8"),
+        "short1": np.array([1024, 2048], dtype="int16"),
+        "int1": np.array([65536, 65536], dtype="int32"),
+        "long1": np.array([9223372036854775807, 9223372036854775807], dtype="int64"),
+        "float1": np.array([1.0, 2.0], dtype="float32"),
+        "double1": np.array([-15.0, -5.0], dtype="float64"),
+        "bytes1": np.array([b"\x00\x01\x02\x03\x04", b""], dtype="object"),
+        "string1": np.array(["hi", "bye"], dtype="object"),
+    }
+    expected = pd.DataFrame.from_dict(data)
+
+    with tm.ensure_clean() as path:
+        expected.to_orc(path)
+        got = read_orc(path)
+
+    tm.assert_equal(expected, got)
+
+
+@td.skip_if_no("pyarrow", min_version="7.0.0")
+def test_orc_roundtrip_bytesio():
+    # GH44554
+    # PyArrow gained ORC write support with the current argument order
+    data = {
+        "boolean1": np.array([False, True], dtype="bool"),
+        "byte1": np.array([1, 100], dtype="int8"),
+        "short1": np.array([1024, 2048], dtype="int16"),
+        "int1": np.array([65536, 65536], dtype="int32"),
+        "long1": np.array([9223372036854775807, 9223372036854775807], dtype="int64"),
+        "float1": np.array([1.0, 2.0], dtype="float32"),
+        "double1": np.array([-15.0, -5.0], dtype="float64"),
+        "bytes1": np.array([b"\x00\x01\x02\x03\x04", b""], dtype="object"),
+        "string1": np.array(["hi", "bye"], dtype="object"),
+    }
+    expected = pd.DataFrame.from_dict(data)
+
+    orc_bytes = expected.to_orc()
+    got = read_orc(BytesIO(orc_bytes))
+
+    tm.assert_equal(expected, got)
+
+
+@td.skip_if_no("pyarrow", min_version="7.0.0")
+@pytest.mark.parametrize("df_not_supported", orc_writer_dtypes_not_supported)
+def test_orc_writer_dtypes_not_supported(df_not_supported):
+    # GH44554
+    # PyArrow gained ORC write support with the current argument order
+    msg = "The dtype of one or more columns is not supported yet."
+    with pytest.raises(NotImplementedError, match=msg):
+        df_not_supported.to_orc()
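
For reviewers who want to exercise the new API end to end, here is a minimal round-trip sketch. It is not part of the diff; it assumes a pandas build containing this change (1.5.0 or later) with pyarrow >= 7.0.0 installed on a non-Windows platform, and walks the two paths the patch adds: the bytes-returning call and the unsupported-dtype guard.

    import io

    import pandas as pd

    # Only ORC-supported dtypes: no category, unsigned integer,
    # interval, period or sparse columns.
    df = pd.DataFrame(
        {
            "col1": [1, 2],
            "col2": ["a", "b"],
            "when": pd.date_range("2013-01-01", periods=2),  # tz-naive datetimes
        }
    )

    # With path=None (the default), to_orc returns the ORC file as bytes...
    raw = df.to_orc()

    # ...which read_orc accepts through any binary file-like object.
    result = pd.read_orc(io.BytesIO(raw))
    assert result.equals(df)

    # Unsupported dtypes fail fast with NotImplementedError instead of
    # erroring deep inside pyarrow.
    try:
        df.astype({"col1": "uint64"}).to_orc()
    except NotImplementedError as err:
        print(err)  # The dtype of one or more columns is not supported yet.

The early failure in the last step comes from the dtype check added in pandas/io/orc.py above; per its inline comment, that guard can be dropped once pyarrow 9.0.0 handles these conversions itself.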